Upload 37 files
Browse files- .gitattributes +1 -0
- Dockerfile +43 -0
- app.py +345 -0
- data/prebuilt/corpus/chunk_entities.json +1 -0
- data/prebuilt/corpus/chunks.jsonl +0 -0
- data/prebuilt/corpus/documents.jsonl +18 -0
- data/prebuilt/corpus/entities.jsonl +264 -0
- data/prebuilt/corpus/relations.jsonl +180 -0
- data/prebuilt/corpus/stats.json +7 -0
- data/prebuilt/graph/communities.json +693 -0
- data/prebuilt/graph/graph_edges.jsonl +0 -0
- data/prebuilt/graph/graph_nodes.jsonl +0 -0
- data/prebuilt/index/community_embeddings.pkl +3 -0
- data/prebuilt/index/dense_index.faiss +3 -0
- data/prebuilt/index/dense_index.meta.pkl +3 -0
- data/prebuilt/index/sparse_index.pkl +3 -0
- graphrag_v4/__init__.py +14 -0
- graphrag_v4/__pycache__/__init__.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/chunking.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/corpus_builder.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/embeddings.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/extraction.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/graph_builder.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/models.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/qa.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/retriever.cpython-311.pyc +0 -0
- graphrag_v4/__pycache__/visualization.cpython-311.pyc +0 -0
- graphrag_v4/chunking.py +232 -0
- graphrag_v4/corpus_builder.py +250 -0
- graphrag_v4/embeddings.py +127 -0
- graphrag_v4/extraction.py +428 -0
- graphrag_v4/graph_builder.py +501 -0
- graphrag_v4/models.py +167 -0
- graphrag_v4/qa.py +181 -0
- graphrag_v4/retriever.py +445 -0
- graphrag_v4/visualization.py +719 -0
- prebuild.py +163 -0
- requirements.txt +30 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/prebuilt/index/dense_index.faiss filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# System dependencies for PyMuPDF, FAISS, and igraph
|
| 4 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 5 |
+
build-essential \
|
| 6 |
+
libglib2.0-0 \
|
| 7 |
+
libsm6 \
|
| 8 |
+
libxext6 \
|
| 9 |
+
libxrender-dev \
|
| 10 |
+
libmupdf-dev \
|
| 11 |
+
git \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Create non-root user (required by HF Spaces)
|
| 15 |
+
RUN useradd -m -u 1000 user
|
| 16 |
+
WORKDIR /home/user/app
|
| 17 |
+
|
| 18 |
+
# Install Python dependencies in layers for caching
|
| 19 |
+
COPY requirements.txt .
|
| 20 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 21 |
+
|
| 22 |
+
# Copy application code
|
| 23 |
+
COPY graphrag_v4/ ./graphrag_v4/
|
| 24 |
+
COPY app.py .
|
| 25 |
+
COPY prebuild.py .
|
| 26 |
+
|
| 27 |
+
# Copy pre-built graph and index if present.
|
| 28 |
+
# NOTE: COPY fails the build if data/prebuilt/ is missing from the build context, so keep the directory (even if empty);
|
| 29 |
+
# without prebuilt data the demo falls back to upload mode.
|
| 30 |
+
COPY data/prebuilt/ ./data/prebuilt/
|
| 31 |
+
|
| 32 |
+
# Switch to non-root user
|
| 33 |
+
USER user
|
| 34 |
+
|
| 35 |
+
EXPOSE 7860
|
| 36 |
+
|
| 37 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 38 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 39 |
+
GRADIO_SERVER_PORT=7860 \
|
| 40 |
+
HF_HOME=/home/user/.cache/huggingface \
|
| 41 |
+
PREBUILT_DIR=./data/prebuilt
|
| 42 |
+
|
| 43 |
+
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Gradio Chat Demo for HF Spaces
|
| 3 |
+
|
| 4 |
+
Features:
|
| 5 |
+
- Upload PDFs → auto-builds corpus + knowledge graph
|
| 6 |
+
- Chat interface for Q&A over the graph
|
| 7 |
+
- Shows retrieved sources, graph stats, community context
|
| 8 |
+
- Works on CPU (HF Spaces free tier) and GPU
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
import shutil
|
| 14 |
+
import tempfile
|
| 15 |
+
import time
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import List, Tuple
|
| 18 |
+
|
| 19 |
+
import gradio as gr
|
| 20 |
+
|
| 21 |
+
# ── Global state ─────────────────────────────────────────────────────
|
| 22 |
+
|
| 23 |
+
# Scratch directory for the current upload session: process_uploads() copies
# the PDFs into CORPUS_DIR / "pdfs" and writes the corpus to CORPUS_DIR / "output".
CORPUS_DIR = Path("/tmp/graphrag_corpus")
# Removed at the start of every rebuild; nothing visible in this file writes to it.
INDEX_DIR = Path("/tmp/graphrag_index")

# Module-level singletons, (re)assigned by process_uploads().
_builder = None  # KnowledgeGraphBuilder holding the graph and its communities
_retriever = None  # HybridRetriever indexed with the corpus chunks and communities
_qa = None  # GraphRAGQA instance used by chat_respond()
_system_ready = False  # set to True only after a build has run to completion
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_system_status() -> str:
    """Return a short, human-readable status line for the UI.

    Returns:
        A "no corpus" hint while nothing has been built, otherwise a summary
        of the node, edge, community and cross-document entity counts
        reported by the graph builder's ``get_stats()``.
    """
    # Guard on the builder as well as the flag, mirroring get_graph_info():
    # the flag alone does not guarantee that a builder instance exists.
    if not _system_ready or _builder is None:
        return "⏳ No corpus loaded. Upload PDF files to get started."
    stats = _builder.get_stats()
    return (
        f"✅ System ready\n"
        f"📊 Nodes: {stats['total_nodes']} | Edges: {stats['total_edges']} | "
        f"Communities: {stats['communities']} | Cross-doc entities: {stats.get('cross_doc_entities', 0)}"
    )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ── Corpus building ─────────────────────────────────────────────────
|
| 44 |
+
|
| 45 |
+
def process_uploads(files, progress=gr.Progress()) -> str:
    """Build the corpus, knowledge graph, search index and QA chain from uploaded PDFs.

    Args:
        files: Uploaded items from the file component. Each item is either an
            object exposing its path as ``.name`` or a plain path; items
            without a ``.pdf`` suffix are ignored.
        progress: Gradio progress tracker used to report the build stage.

    Returns:
        The accumulated build log on success, or a single line starting with
        "❌" that describes why the build did not complete.

    Side effects:
        Recreates ``CORPUS_DIR``, removes ``INDEX_DIR`` and rebinds the
        module-level ``_builder``, ``_retriever``, ``_qa`` and
        ``_system_ready``.
    """
    global _builder, _retriever, _qa, _system_ready

    if not files:
        return "❌ No files uploaded."

    start = time.time()
    log_lines = []
    log = log_lines.append

    try:
        # Validate the selection before touching any state, so an upload
        # without PDFs cannot wipe a corpus that is already loaded.
        pdf_sources = []
        for f in files:
            src = Path(f.name) if hasattr(f, 'name') else Path(f)
            if src.suffix.lower() == ".pdf":
                pdf_sources.append(src)
        if not pdf_sources:
            return "❌ No PDF files found in upload."

        # From here on the previous corpus is being replaced.
        _system_ready = False

        # Clean previous state
        if CORPUS_DIR.exists():
            shutil.rmtree(CORPUS_DIR)
        if INDEX_DIR.exists():
            shutil.rmtree(INDEX_DIR)

        # Copy uploaded PDFs
        pdf_dir = CORPUS_DIR / "pdfs"
        pdf_dir.mkdir(parents=True, exist_ok=True)
        for src in pdf_sources:
            shutil.copy2(str(src), str(pdf_dir / src.name))
            log(f"📄 {src.name}")
        pdf_count = len(pdf_sources)

        log(f"\n🔧 Processing {pdf_count} PDF(s)...")
        progress(0.1, desc="Building corpus...")

        # Step 1: Build corpus (project imports stay local to the function,
        # so they are only resolved once a build is actually requested).
        from graphrag_v4.corpus_builder import build_corpus
        corpus_out = CORPUS_DIR / "output"
        build_corpus(input_path=pdf_dir, output_dir=corpus_out, max_chunk_tokens=384)

        stats_file = corpus_out / "stats.json"
        if not stats_file.exists():
            return "❌ Corpus building failed — no output produced."
        # Explicit UTF-8, consistent with how chunks.jsonl is read below.
        stats = json.loads(stats_file.read_text(encoding="utf-8"))
        log(f" Chunks: {stats['total_chunks']}, Entities: {stats['total_entities']}, Relations: {stats['total_relations']}")

        progress(0.4, desc="Building knowledge graph...")

        # Step 2: Build knowledge graph
        from graphrag_v4.graph_builder import KnowledgeGraphBuilder
        _builder = KnowledgeGraphBuilder()
        _builder.load_corpus(corpus_out)
        _builder.build_cooccurrence_edges()
        cross_doc = _builder.build_cross_document_edges()
        _builder.compute_pagerank()
        _builder.detect_communities(n_levels=2)
        _builder.generate_community_summaries()

        kg_stats = _builder.get_stats()
        log(f"\n🕸️ Knowledge Graph:")
        log(f" Nodes: {kg_stats['total_nodes']}, Edges: {kg_stats['total_edges']}")
        log(f" Communities: {kg_stats['communities']}, Cross-doc edges: {cross_doc}")

        progress(0.7, desc="Building search index...")

        # Step 3: Build retriever
        from graphrag_v4.retriever import HybridRetriever
        _retriever = HybridRetriever(graph=_builder.graph, communities=_builder.communities)

        chunks = []
        chunks_file = corpus_out / "chunks.jsonl"
        if chunks_file.exists():
            with open(chunks_file, "r", encoding="utf-8") as fh:
                # Skip blank lines (e.g. a trailing newline), which
                # json.loads would reject.
                chunks = [json.loads(line) for line in fh if line.strip()]

        _retriever.index_chunks(chunks)
        _retriever.index_communities(_builder.communities)

        progress(0.9, desc="Initializing QA...")

        # Step 4: Initialize QA; the LLM is optional and only used when an
        # API key is configured and the client reports itself as available.
        from graphrag_v4.qa import GraphRAGQA, LLMClient
        llm = None
        api_key = os.environ.get("OPENAI_API_KEY")
        if api_key:
            llm = LLMClient(api_key=api_key)
            if llm.available:
                log(f"\n🤖 LLM: {llm.model} connected")
            else:
                llm = None

        _qa = GraphRAGQA(retriever=_retriever, llm_client=llm)

        _system_ready = True
        elapsed = time.time() - start
        log(f"\n✅ Ready in {elapsed:.1f}s")

        progress(1.0, desc="Done!")
        return "\n".join(log_lines)

    except Exception as e:
        # UI boundary: report the failure in the build log textbox instead of
        # letting the handler crash. _system_ready stays False in that case.
        return f"❌ Error: {str(e)}"
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ── Chat ─────────────────────────────────────────────────────────────
|
| 157 |
+
|
| 158 |
+
def chat_respond(message: str, history: List) -> Tuple[List, str]:
    """Answer a chat message using the knowledge-graph QA chain.

    Args:
        message: The user's question as typed into the textbox.
        history: Chat history as a list of ``(user, bot)`` pairs. ``None`` is
            treated as an empty history; an existing list is extended in place.

    Returns:
        ``(history, "")`` — the updated history plus an empty string that is
        written back to the input textbox to clear it.
    """
    # Be defensive: a caller may pass None for an empty conversation.
    if history is None:
        history = []

    if not _system_ready or _qa is None:
        history.append((message, "⚠️ Please upload PDF files first using the panel on the left."))
        return history, ""

    # Nothing to ask: do not spend a retrieval + QA round on blank input.
    if not message or not message.strip():
        return history, ""

    try:
        start = time.time()
        result = _qa.answer(message, top_k=8, use_communities=True)
        elapsed = time.time() - start

        # Build response with sources
        response_parts = [result.answer]

        if result.sources:
            response_parts.append("\n\n---\n**📚 Sources:**")
            # Only the five best sources are listed to keep the bubble short.
            for i, src in enumerate(result.sources[:5], 1):
                # Per-channel scores are shown only when they contributed.
                scores = [
                    f"{label}={value:.2f}"
                    for label, value in (
                        ("dense", src.dense_score),
                        ("sparse", src.sparse_score),
                        ("graph", src.graph_score),
                    )
                    if value > 0
                ]
                score_str = f" ({', '.join(scores)})" if scores else ""
                response_parts.append(
                    f"{i}. **{src.title[:60]}** — p.{src.page} — score: {src.score:.4f}{score_str}"
                )

        response_parts.append(f"\n*Confidence: {result.confidence:.0%} | Time: {elapsed:.2f}s | {', '.join(result.reasoning)}*")

        response = "\n".join(response_parts)
        history.append((message, response))

    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing
        # the event handler.
        history.append((message, f"❌ Error: {str(e)}"))

    return history, ""
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def get_graph_info() -> str:
    """Render the current graph statistics as Markdown for the info panel.

    Returns:
        "No graph loaded." while no build is available; otherwise a Markdown
        report with node/edge totals, their per-type breakdowns, community
        and cross-document entity counts, and the five largest communities.
    """
    if _builder is None or not _system_ready:
        return "No graph loaded."

    stats = _builder.get_stats()
    report = ["## 📊 Knowledge Graph Statistics\n"]

    report.append(f"**Nodes:** {stats['total_nodes']}")
    node_types = stats.get('node_types')
    if node_types:
        report.extend(f" - {kind}: {total}" for kind, total in node_types.items())

    report.append(f"\n**Edges:** {stats['total_edges']}")
    edge_types = stats.get('edge_types')
    if edge_types:
        report.extend(f" - {kind}: {total}" for kind, total in edge_types.items())

    report += [
        f"\n**Communities:** {stats['communities']}",
        f"**Cross-doc entities:** {stats.get('cross_doc_entities', 0)}",
    ]

    # Show some communities: the five largest, each with up to three key entities.
    if _builder.communities:
        report.append("\n## 🏘️ Top Communities\n")
        largest_first = sorted(
            _builder.communities.values(), key=lambda c: c.size, reverse=True
        )
        for community in largest_first[:5]:
            if community.key_entities:
                highlights = ", ".join(community.key_entities[:3])
            else:
                highlights = "—"
            report.append(
                f"**{community.community_id}** ({community.size} nodes): {highlights}"
            )

    return "\n".join(report)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ── Gradio UI ────────────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
+
def build_ui():
    """Assemble the Gradio Blocks app and wire its event handlers.

    The page consists of a header, a row with the upload / build / log /
    graph-info column next to the chat column, and a list of example
    questions that fill the chat input.

    Returns:
        The ``gr.Blocks`` instance, ready for ``launch()``.
    """
    page_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="slate",
    )
    page_css = """
    .main-header { text-align: center; margin-bottom: 1rem; }
    .upload-section { border: 2px dashed #4a90d9; border-radius: 12px; padding: 1rem; }
    footer { display: none !important; }
    """
    header_md = """
    # 🕸️ GraphRAG v4 — Cross-Document Knowledge Graph QA
    Upload PDF documents → automatic KG construction → ask questions over the graph.

    **Pipeline:** PDF → Chunks → GLiNER Entities → Proximity Relations → Cross-Doc Linking → Leiden Communities → BGE-M3 Hybrid Search → Answer
    """
    example_questions = [
        "Які документи потрібні для вступу?",
        "Хто є гарантом освітньої програми?",
        "Вимоги до бакалавра з кібербезпеки",
        "What are the admission requirements?",
        "Summarize the main topics across all documents",
    ]

    with gr.Blocks(
        title="GraphRAG v4 — Cross-Document Knowledge Graph QA",
        theme=page_theme,
        css=page_css,
    ) as demo:
        gr.Markdown(header_md, elem_classes="main-header")

        with gr.Row():
            # Left panel: Upload + Status
            with gr.Column(scale=1):
                gr.Markdown("### 📁 Document Upload")
                pdf_input = gr.File(
                    file_count="multiple",
                    file_types=[".pdf"],
                    label="Upload PDFs",
                    elem_classes="upload-section",
                )
                build_button = gr.Button("🔧 Build Knowledge Graph", variant="primary", size="lg")
                log_box = gr.Textbox(
                    label="Build Log",
                    lines=12,
                    interactive=False,
                    show_copy_button=True,
                )

                with gr.Accordion("📊 Graph Info", open=False):
                    info_panel = gr.Markdown("No graph loaded.")
                    refresh_button = gr.Button("🔄 Refresh Stats", size="sm")

            # Right panel: Chat
            with gr.Column(scale=2):
                gr.Markdown("### 💬 Ask Questions")
                chat_window = gr.Chatbot(
                    label="GraphRAG Chat",
                    height=500,
                    show_copy_button=True,
                    bubble_full_width=False,
                )
                with gr.Row():
                    question_box = gr.Textbox(
                        placeholder="Ask a question about your documents...",
                        label="",
                        scale=5,
                        show_label=False,
                    )
                    send_button = gr.Button("Send", variant="primary", scale=1)

                clear_button = gr.Button("🗑️ Clear Chat", size="sm")

        # Example questions: clicking one copies it into the chat input.
        gr.Markdown("### 💡 Example Questions")
        gr.Examples(examples=example_questions, inputs=question_box)

        # Events
        build_button.click(fn=process_uploads, inputs=[pdf_input], outputs=[log_box])

        # The Send button and pressing Enter in the textbox share one handler.
        for register in (send_button.click, question_box.submit):
            register(
                fn=chat_respond,
                inputs=[question_box, chat_window],
                outputs=[chat_window, question_box],
            )

        clear_button.click(fn=lambda: [], outputs=[chat_window])
        refresh_button.click(fn=get_graph_info, outputs=[info_panel])

    return demo
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
# ── Entry point ──────────────────────────────────────────────────────
|
| 338 |
+
|
| 339 |
+
if __name__ == "__main__":
    # Honour GRADIO_SERVER_NAME / GRADIO_SERVER_PORT (set in the Dockerfile)
    # instead of shadowing them with hard-coded literals; the fallbacks keep
    # the previous bind address and port when the variables are unset.
    demo = build_ui()
    demo.launch(
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
        share=False,
    )
|
data/prebuilt/corpus/chunk_entities.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"c0c2349542f40602": ["58293efde3d65766", "6ee638ce41216594", "7b046ef3d57edb7a"], "aec957bff3333594": ["c2f2ef49af307713", "bfa00e8db74c2bff", "250d9f3642fd0543", "6ee638ce41216594", "aa046696ad983530", "b972a394b89fc296"], "55caa1ade06bdf64": ["5d47659c01a5bfe0", "f9491e067e6094a3", "2396ca191bb74a2f", "8ef745e21a875f72", "5bce0cd172cf9389", "dd260601f883f45b", "319103bce9ac4ecb", "3324747a90504200", "d6caffc95d4ea230"], "7d2fabee274a867b": ["4e3a6a778bd02a9a", "4608f9da19534fdf", "54e5434714345859", "4fed50ed9d1c3c43", "37948bdfda119408", "44ff037b4be33247", "c50aef3ee34c16b0", "319103bce9ac4ecb", "290254ba61badf4d"], "ff9c88b3a7fb7942": ["9ae9a1669500ffa1", "d5000b0072dd6b2e"], "bef65c47c7457b9b": ["4e1c9e09669d9739", "319103bce9ac4ecb", "71016125a13c0e0d"], "9e41a312d020a671": ["b88bcdbd0c134780"], "7ba9bd963e52750c": ["aea9a80c0b30ac81", "1b9a3e20d4e2de7d"], "98bae9ab7f84fab6": ["48e8eb52059e5bab", "272fc3de5af4b271", "206d4facf8046312", "b88bcdbd0c134780"], "e2a880a0cee5051c": ["938619211ac368a8"], "d89f7445018e510e": ["0968094acb78c1ad", "97722021b1a2792b"], "17fca38c5fe9bbe2": ["a7c0a7066c2e9018", "206d761f50f8f06d"], "b29667928e818bb6": ["fc7e425c76ed0914", "b5dc06eb5bd93776", "245ea398e76f2b9d", "1d00762131f7927c", "3d510e2163671925", "9b1d2e56d20f1ed0", "0d39e7b35c5a076e"], "c0031f76c1b5be0a": ["a0afc115cc35343d", "cc66c883cdb05b6c", "913d7e8e7c2f67f4"], "3725adad1a375aae": ["06e267506e80e944", "f74e210776727dd2", "922d634207f1bd3b"], "ef794c82b54bd7fa": ["615aeabe0d2e90c4", "5500e778ecf224d5"], "d95be0bb9a2a5e85": ["34c3561b548abce9"], "2a766068f1b63784": ["79f52309b208feba"], "82e419ef4b6a4f3a": ["c6a80669b78e38d0", "db6dd07ac3e3c6b9", "bd8c382a93b83173", "41dd209cb0943b81", "409f31faf973efbb", "76c1b09c6ba5c29d", "8c57b3adbf877d92", "122b3fe2e920d773", "c8c030a0bccb918c", "8106aa9561ab7c2f"], "e6d769da18887031": ["58293efde3d65766"], "d42293ae57732392": ["ceae3c879404fdc2", "eae30d306871e586", "01c24beb989b4039", "250d9f3642fd0543", 
"b445c482d619deb7", "986d05c4d03adaf6", "2c8678efa06f67f9", "6dbca2dfad017f88"], "f648911d33733b47": ["dd260601f883f45b", "5d47659c01a5bfe0", "f9491e067e6094a3", "8fe70805d62442fe", "8ef745e21a875f72", "5bce0cd172cf9389", "496bd95e14f270d0", "346996767a29a82b"], "83fdca75c9a45216": ["cf3cd7d2b4d5ff70", "c4f97b6cbc55754d", "4608f9da19534fdf", "055172975b24840d", "d5000b0072dd6b2e", "8fe70805d62442fe", "d3934aa41810502a", "4ce0475ef4976b1f"], "07abb3f4f4f1acfc": ["4c8642012c1b3a70", "867422b0430e1564"], "60ad204aaed1308f": ["19f4b16b62f7d2ac", "5893788174cc1112", "05fd3595bf5c36de"], "ca545a46260dd835": ["374bf375044352a6", "5e1dde48ef03f452", "5d79062899f094ab"], "a85130ce52e9d5ad": ["825552a12c15e926", "00cf239d3fdd3c0b", "8fe70805d62442fe", "acdd1ddf99d6466e"], "e3dc8b7767957357": ["6b2065ed53799d5d", "be6b689037cfd3f5", "9696f2942cbcefbf", "27539ade77e10aa5", "76840deb548f6613", "bb0e47ca7d64c3a2", "f09ce6171ec29ca4", "8b39040d3506de46", "699ec9c1516b9078"], "22cdbb71f4bf1575": ["2f56f53dd8d0ca61", "b8ade73e2d5d467f", "a900f9c64ad40b8c", "8f6a3bfa88668fda", "b66f51090366d95e", "701e78c42935ac92", "a96b08dadeee4ed1"], "ad982e91da077a15": ["ffe98e727478892d", "05094b9aa362a52d"], "f6ee4dbcc8c3594d": ["272fc3de5af4b271", "206d4facf8046312", "b88bcdbd0c134780"], "f5a9189542062792": ["8fe70805d62442fe"], "0772f4f3d26b8789": ["948493e60578ae31", "09d7289e764aa019", "e90e15bb3ebc3b46", "d02c0c0e7726d3f1", "d62471d8c2e91d44"], "76df204594c6c6fc": ["09d7289e764aa019", "8fe70805d62442fe"], "4f19ac3696aba26d": ["b595cc5c5dbc9f27", "310090e8deaa9df2"], "b3bac256cd6b3f60": ["45aa5067f79407c7", "f8d71c90965bef12", "b782f37083fcb21e", "322cec6ce3bc2db5", "ebfc6922e8102c64"], "5f91796791f7b0d0": ["ae903905b511ec19", "250d9f3642fd0543", "b595cc5c5dbc9f27", "986d05c4d03adaf6", "065ba63a18a9cf96", "2a8fb4bebe84aa01", "eae30d306871e586", "84799c7a2e037df3", "8903da6c47c5b405"], "cd050aec27af9add": ["dd260601f883f45b", "bae673da8da75f34", "5d47659c01a5bfe0", "f9491e067e6094a3", 
"8ef745e21a875f72", "5bce0cd172cf9389", "496bd95e14f270d0", "346996767a29a82b"], "e640d430651812eb": ["cf3cd7d2b4d5ff70", "bae673da8da75f34", "4608f9da19534fdf", "d5000b0072dd6b2e", "0e97e28b90683458", "dfa8525adddea21c", "0c36a78449184e85", "1a39e93b2f96d97a", "13f2eb73585baa18"], "51a1661d308c3929": ["3159a3e3e182afdb", "43f1fa06d0fb0bd7", "b88bcdbd0c134780"], "5d3f888365936987": ["4c8642012c1b3a70", "5e1dde48ef03f452", "0969f1b2971aa784", "d9262d20fe39c685", "5d79062899f094ab"], "2fdeb829268a59b3": ["c8c2ed4483d9e387", "6a877c5c22acd65d", "49f9040d6c533349", "2e8b38e56500a7a3"], "ba847fbe086d51b1": ["9bd3d8195311b86a", "be6b689037cfd3f5", "9696f2942cbcefbf", "bb0e47ca7d64c3a2", "9b44aac7b71f3b81", "699ec9c1516b9078"], "7c5467d53d80fcdd": ["2f56f53dd8d0ca61", "b8ade73e2d5d467f", "a900f9c64ad40b8c", "8f6a3bfa88668fda", "b66f51090366d95e", "701e78c42935ac92", "a96b08dadeee4ed1"], "000ecf882de345cd": ["86fd6e5fe8e9acc5", "ffe98e727478892d", "05094b9aa362a52d"], "070c0b50ef0891a7": ["272fc3de5af4b271", "206d4facf8046312", "b88bcdbd0c134780"], "2a282e6e2664aecf": ["938619211ac368a8"], "d1c5ea058e826417": ["6a877c5c22acd65d"], "d44febd45afddc60": ["f720170162ea75a3", "6a877c5c22acd65d", "eac8143bbdabd053", "c0b73e547fb01a0d"], "b5afffb06306c471": ["5f2a79a8d81b577c", "eccdae69f659f373", "1932d4efc6aed553", "ca36425b9fea1047", "730864a279fa38ed"], "b2a06019710d6478": ["948493e60578ae31", "09d7289e764aa019", "e90e15bb3ebc3b46", "d02c0c0e7726d3f1", "d62471d8c2e91d44"], "239cfef495d5fcb0": ["c13c1778434d4401"], "e552699b03b077fe": ["6ad061c8308163a4", "09d7289e764aa019"], "219dd7d878ce9601": ["b595cc5c5dbc9f27", "310090e8deaa9df2"], "40cabbdcd801245c": ["f8d71c90965bef12", "ebfc6922e8102c64"], "afedd4c94bd030de": ["43c51e4e3af97e3b", "9d185bb8bfa967bc"], "c8a70c290d054a7f": ["d9e6447f15698416", "58293efde3d65766", "6ee638ce41216594", "5c3c6ed4d533a2c1", "581d47447ce9c3f4", "48c60bc68fad6568", "212a4cbee48972fb", "319103bce9ac4ecb", "c9fbd3e8cd885042", "b0c9f122b4dde681", 
"cef1dffeff86b2ad", "9911b886b4e40ea5"], "4f431acaf7aa3091": ["79f52309b208feba"], "c9333c7f250d9980": ["97cd6dc5749e6b61", "c0c976dc9f08ee09", "97722021b1a2792b", "0fb142c0536723a5", "4edfea8258d94845"], "1f7deed0ad1b97b8": ["60d10086ca90e035", "cf56ebddf5a35607", "9046fecf16ca5776", "c41170818cb6c11a", "7d7502050cc53cdd", "d82a8d307fd8eb65"], "2116032ea505ca64": ["16cefbcfc3301ca1"], "bbb2d6d6809f3ad9": ["352c95a60b376529"], "ebbeae105f902e16": ["70fbedafe4c03090", "352c95a60b376529"], "43d1da5288d7920a": ["70fbedafe4c03090", "7da8314b0a04d391", "6e95386ba46783ae", "dd9e68038557a4d1"], "b8c1123eb432e346": ["6e95386ba46783ae"], "5834b4ab5eb62ddc": ["6e95386ba46783ae"], "6c9896a73eb9dadb": ["00cd6ec8a2ea3f44", "6e95386ba46783ae"], "fb83621a8920b5de": ["6e95386ba46783ae"], "d624229769e51fdb": ["79f52309b208feba"], "a9d561875549b1d7": ["37948bdfda119408"], "d45705c9d6499369": ["905eb46730ee5067"], "c1f0d26d3293af30": ["86eced7f7b77fe6a"], "9f9129eb9ea4105d": ["86eced7f7b77fe6a"], "d2f99ab8996bbca3": ["86eced7f7b77fe6a"], "daf8c3052517c0c4": ["b18f2b51c06200ef", "79f52309b208feba"], "20120b0b0d706b71": ["0e25ef31b4529625", "355b77129964e5ba", "48dc9a385ba4fbd8", "6b557e862a4e7a83"], "0eb62e34341fc885": ["096b111a255bad7f"], "e40c978ac81c8e56": ["c2cb6858473acfca", "5af4e9f79989b6cc", "7a34d4c3a4e2a5d9"], "96cc9792297b04c8": ["50c3bc64c15d6a7b", "ce706d1201dd22bb", "6f9322cda2097382", "07624d428730f41a", "fb8e867f51196d1e"], "ff3d29a7c347f81f": ["16a58157ceda2854", "0f8293abf49fb235", "d3934aa41810502a", "319103bce9ac4ecb", "cef1dffeff86b2ad", "991851bc4eb66ce7"], "3fa3e2126481683a": ["d9e6447f15698416", "01cb9d90b7f7b0fb", "7f6f05d5d2de017e", "ca41d203d504603f", "0c32e3788d1919fe", "319103bce9ac4ecb", "b0c9f122b4dde681", "cef1dffeff86b2ad", "212a4cbee48972fb"], "256daeeb4709366c": ["ed929af314e5c217", "60102cdf4d854f34"], "193f6c0d3c4a1271": ["60102cdf4d854f34"], "8a9f606791f188a9": ["4f1d66c8ddf1b649"], "12639a6d0134394d": ["cb0eaa459b89d023"], "83c09122974e4dd1": 
["1d00762131f7927c"], "9749c0c98e2d3561": ["b8a44fca7a25d75e"], "0b6a89b240bf5698": ["37948bdfda119408"], "7309e0bd987b1588": ["7d417c302db221f8"], "4c7146324ce73435": ["50eaaf0454fed8de"], "9e384c927f76dbf0": ["e875322dc2569a22", "355b77129964e5ba", "79f52309b208feba", "970141d598e2b973", "ed1e551c2e2c23f4", "8ea18a7337f8bebc"], "4377f6786a5a41fd": ["69a63c2f8c3447d6"], "86f1ca29b96afbf4": ["784c19bb97840e78", "31746da19135eb78", "6b557e862a4e7a83", "79f52309b208feba"], "d6b188f4e2232a2c": ["782aee541767578b", "ce706d1201dd22bb"], "34983f681748b1c6": ["8ae61549fb9dbd7f", "e9177cbaa806b0f2", "8e0e8ac292079e8f", "ec472be46def442f", "3abc3cf6dfcc60ab", "a156e5e34505bff7"], "e387576a2920a178": ["c6ff2def708b661a"], "80f53cdccfc87c4e": ["a0c1c19f89128b5b", "89a8294091d81ce8"], "04f251f674cdea32": ["b5bd38732575124b", "baaee1ebf82f1ca0"], "2a3122d2a379c1ce": ["8a79109dedafded4", "5bd6e1530b6209b9", "34fb96ce85d84537"], "3ce4e4972c2988b2": ["5bd6e1530b6209b9"], "51c002d66b88ec4f": ["48c60bc68fad6568", "16a58157ceda2854", "d3934aa41810502a", "7b046ef3d57edb7a", "319103bce9ac4ecb", "991851bc4eb66ce7"], "0e0658e7bdcc2a7c": ["d9e6447f15698416", "01cb9d90b7f7b0fb", "7f6f05d5d2de017e", "ca41d203d504603f", "0c32e3788d1919fe", "319103bce9ac4ecb", "b0c9f122b4dde681", "cef1dffeff86b2ad", "212a4cbee48972fb"], "77436f64ff77291a": ["ed929af314e5c217", "60102cdf4d854f34"], "4a5e38c24938d7da": ["60102cdf4d854f34"], "e2fbd3aa81f89018": ["4f1d66c8ddf1b649"], "004de6bc3f3b2ffb": ["cb0eaa459b89d023"], "0facfaf0f9bc4723": ["1d00762131f7927c"], "6c9a5cda2d241944": ["b8a44fca7a25d75e"], "625124b98bff96e2": ["37948bdfda119408"], "06f02e5985dacd62": ["7d417c302db221f8"], "47cac461812d84b6": ["50eaaf0454fed8de"], "5a49d50c4b784809": ["e875322dc2569a22", "355b77129964e5ba", "79f52309b208feba", "970141d598e2b973", "ed1e551c2e2c23f4", "8ea18a7337f8bebc"], "b123ad063fd6e616": ["69a63c2f8c3447d6"], "99190c180bdba5e1": ["dd19c79674faa7bd", "355b77129964e5ba", "50b9cb80c26211e4", "6b97bbdb92030366", 
"72c3ce0a8eee26de"], "3aa93846fb2a2d54": ["784c19bb97840e78", "31746da19135eb78", "6b557e862a4e7a83", "79f52309b208feba"], "a5282eb098a9de3b": ["782aee541767578b", "ce706d1201dd22bb"], "54fd7d05fdccb9d3": ["8ae61549fb9dbd7f", "e9177cbaa806b0f2", "8e0e8ac292079e8f", "ec472be46def442f", "3abc3cf6dfcc60ab", "a156e5e34505bff7"], "e935c846d8eafba4": ["c6ff2def708b661a"], "42aff4b4c6cfc199": ["a0c1c19f89128b5b", "89a8294091d81ce8"], "448e666ba8fe40f9": ["b5bd38732575124b", "baaee1ebf82f1ca0"], "0bef74a36daf31d1": ["8a79109dedafded4", "5bd6e1530b6209b9", "34fb96ce85d84537"], "f6efa28179d67e17": ["5bd6e1530b6209b9"], "9f7180e50738c510": ["d9e6447f15698416", "58293efde3d65766", "e3c29c94fe5f788a", "f58e8e6f3ea6435a", "5c3c6ed4d533a2c1", "0f8293abf49fb235", "d3934aa41810502a", "d06b53ab82edfc5c", "b0c9f122b4dde681", "319103bce9ac4ecb", "cef1dffeff86b2ad", "212a4cbee48972fb"], "b99ad375f679e170": ["5e1dde48ef03f452"], "c8ad92dd666d164b": ["0968094acb78c1ad"], "8d359804040b7767": ["245ea398e76f2b9d", "b5dc06eb5bd93776"], "ec4e92484f0a5a4e": ["3661c2e944d40b82"], "cb820b8eef70d979": ["189fbf0e885b2cd2"], "c8b5f45e988bfbdc": ["7da8314b0a04d391"], "746f49fefb9761f2": ["7195222d09304114", "3661c2e944d40b82"], "1f4d0744f6c2aee8": ["b39b76920d6df157"], "ec194f577de382a2": ["37948bdfda119408", "79f52309b208feba"], "799a59af806e688c": ["5187796566007fb7"], "11d0b9f27311fcf0": ["0de8a786de8a293a", "c73573bcb8a98942", "7d417c302db221f8"], "b58de4bac0bacaa4": ["b18f2b51c06200ef", "79f52309b208feba"], "b16452da8d4c69e0": ["6b557e862a4e7a83"], "531476c443d08c99": ["98967461c0d03924", "8b2212d720e7654c"], "e46db54fa020b2a4": ["3482d2ca636c03f4", "dfe899f01540e66e"], "c45c28112723c377": ["ee26938da00b173f", "a8ce5d63880f52fa"], "4fa8387b8db3832a": ["4f6771b02873d1a8", "ee872eca29098193"], "618e1da5930c5ace": ["4f6771b02873d1a8", "e9177cbaa806b0f2"], "2319b160a36f1033": ["b96397144e6ee4c1", "604e588ff8b579b8", "89b95069493fa022", "e27b59e793c2c29d", "09d7289e764aa019", "0fedc5641985ff4f", 
"3e8c1df8b14f7b40", "7aedef9d8d8558b6"]}
|
data/prebuilt/corpus/chunks.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/prebuilt/corpus/documents.jsonl
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"doc_id": "f33f68f0f3885639", "filename": "F3 (122) - Комп'ютерні науки (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO OCBITII I HA)TKI,I YKPAIHI4", "page_count": 19, "processed_at": "2026-02-09T16:47:57.525988"}
|
| 2 |
+
{"doc_id": "b66612a925339457", "filename": "F7 (123) - Комп'ютерна інженерія (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO OCBITI4 I HAYKII YKPAIITTT", "page_count": 16, "processed_at": "2026-02-09T16:48:14.689215"}
|
| 3 |
+
{"doc_id": "59990e00fb6dc958", "filename": "G7 (174) - Автоматизація, комп'ютерно-інтегровані технології та робототехніка (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO ocBITrI I HAyKI,I yKpAiuI,t", "page_count": 16, "processed_at": "2026-02-09T16:48:18.445786"}
|
| 4 |
+
{"doc_id": "cf69446fcc226f76", "filename": "Перелік документів для вступу.pdf", "doc_type": "admission_rules", "title": "Перелік документів для вступу", "page_count": 1, "processed_at": "2026-02-09T16:49:07.945521"}
|
| 5 |
+
{"doc_id": "906125f79a596b75", "filename": "РП_122_Бази Даних_25_26_Коробчинський (семестр 2).pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 13, "processed_at": "2026-02-09T16:55:48.891049"}
|
| 6 |
+
{"doc_id": "d782275144689a32", "filename": "РП_122_Основи_машинного_навчання_25-26.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 15, "processed_at": "2026-02-09T16:55:54.632512"}
|
| 7 |
+
{"doc_id": "29d505a0536ac998", "filename": "РП_122_Основи_машинного_навчання_25-26_05-12-2025.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 15, "processed_at": "2026-02-09T16:55:58.963442"}
|
| 8 |
+
{"doc_id": "d5bf12aea01ab74a", "filename": "РПНД_Візуалізація даних 122 Ковальчук_25-26.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 14, "processed_at": "2026-02-09T16:57:06.055198"}
|
| 9 |
+
{"doc_id": "d717d12183a88d4b", "filename": "СКЛАД приймальної комісії'.pdf", "doc_type": "admission_rules", "title": "МІНІСТЕРСТВО ОСВІТИ УКРАЇНИ", "page_count": 3, "processed_at": "2026-02-09T16:57:06.588367"}
|
| 10 |
+
{"doc_id": "f33f68f0f3885639", "filename": "F3 (122) - Комп'ютерні науки (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO OCBITII I HA)TKI,I YKPAIHI4", "page_count": 19, "processed_at": "2026-02-09T16:57:11.473169"}
|
| 11 |
+
{"doc_id": "b66612a925339457", "filename": "F7 (123) - Комп'ютерна інженерія (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO OCBITI4 I HAYKII YKPAIITTT", "page_count": 16, "processed_at": "2026-02-09T16:57:38.889506"}
|
| 12 |
+
{"doc_id": "59990e00fb6dc958", "filename": "G7 (174) - Автоматизація, комп'ютерно-інтегровані технології та робототехніка (2025-2026).pdf", "doc_type": "unknown", "title": "MIHICTEPCTBO ocBITrI I HAyKI,I yKpAiuI,t", "page_count": 16, "processed_at": "2026-02-09T16:57:49.722652"}
|
| 13 |
+
{"doc_id": "cf69446fcc226f76", "filename": "Перелік документів для вступу.pdf", "doc_type": "admission_rules", "title": "Перелік документів для вступу", "page_count": 1, "processed_at": "2026-02-09T16:58:53.678951"}
|
| 14 |
+
{"doc_id": "906125f79a596b75", "filename": "РП_122_Бази Даних_25_26_Коробчинський (семестр 2).pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 13, "processed_at": "2026-02-09T17:00:12.377405"}
|
| 15 |
+
{"doc_id": "d782275144689a32", "filename": "РП_122_Основи_машинного_навчання_25-26.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 15, "processed_at": "2026-02-09T17:00:16.416400"}
|
| 16 |
+
{"doc_id": "29d505a0536ac998", "filename": "РП_122_Основи_машинного_навчання_25-26_05-12-2025.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 15, "processed_at": "2026-02-09T17:00:20.013005"}
|
| 17 |
+
{"doc_id": "d5bf12aea01ab74a", "filename": "РПНД_Візуалізація даних 122 Ковальчук_25-26.pdf", "doc_type": "educational_program", "title": "Міністерство освіти і науки України", "page_count": 14, "processed_at": "2026-02-09T17:01:29.021579"}
|
| 18 |
+
{"doc_id": "d717d12183a88d4b", "filename": "СКЛАД приймальної комісії'.pdf", "doc_type": "admission_rules", "title": "МІНІСТЕРСТВО ОСВІТИ УКРАЇНИ", "page_count": 3, "processed_at": "2026-02-09T17:01:29.537591"}
|
data/prebuilt/corpus/entities.jsonl
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"entity_id": "7b046ef3d57edb7a", "entity_type": "DATE", "value": "2025", "normalized": "2025", "source_id": "c0c2349542f40602", "confidence": 0.6256306171417236, "doc_ids": ["29d505a0536ac998", "f33f68f0f3885639"], "embedding": null}
|
| 2 |
+
{"entity_id": "2396ca191bb74a2f", "entity_type": "DATE", "value": "10 липня 2019 року", "normalized": "10 липня 2019 року", "source_id": "55caa1ade06bdf64", "confidence": 0.7426362633705139, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 3 |
+
{"entity_id": "5bce0cd172cf9389", "entity_type": "DATE", "value": "23 листопада 2011", "normalized": "23 листопада 2011", "source_id": "55caa1ade06bdf64", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 4 |
+
{"entity_id": "dd260601f883f45b", "entity_type": "DATE", "value": "25 червня 2020", "normalized": "25 червня 2020", "source_id": "55caa1ade06bdf64", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 5 |
+
{"entity_id": "d5000b0072dd6b2e", "entity_type": "DATE", "value": "2029 року", "normalized": "2029 року", "source_id": "ff9c88b3a7fb7942", "confidence": 0.8291820883750916, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 6 |
+
{"entity_id": "1a39e93b2f96d97a", "entity_type": "DATE", "value": "21.05.2021", "normalized": "21.05.2021", "source_id": "e640d430651812eb", "confidence": 0.6158963441848755, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 7 |
+
{"entity_id": "9911b886b4e40ea5", "entity_type": "DATE", "value": "08 вересня 2025 року", "normalized": "08 вересня 2025 року", "source_id": "c8a70c290d054a7f", "confidence": 0.5278382897377014, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 8 |
+
{"entity_id": "c2cb6858473acfca", "entity_type": "DATE", "value": "2012", "normalized": "2012", "source_id": "e40c978ac81c8e56", "confidence": 0.6293627023696899, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 9 |
+
{"entity_id": "ce706d1201dd22bb", "entity_type": "DATE", "value": "2019", "normalized": "2019", "source_id": "d6b188f4e2232a2c", "confidence": 0.8548734188079834, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32"], "embedding": null}
|
| 10 |
+
{"entity_id": "0c32e3788d1919fe", "entity_type": "DATE", "value": "23 вересня 2024 року", "normalized": "23 вересня 2024 року", "source_id": "3fa3e2126481683a", "confidence": 0.6584115028381348, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 11 |
+
{"entity_id": "8e0e8ac292079e8f", "entity_type": "DATE", "value": "1993", "normalized": "1993", "source_id": "34983f681748b1c6", "confidence": 0.8399044871330261, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 12 |
+
{"entity_id": "3abc3cf6dfcc60ab", "entity_type": "DATE", "value": "1994", "normalized": "1994", "source_id": "54fd7d05fdccb9d3", "confidence": 0.8178855776786804, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 13 |
+
{"entity_id": "c6ff2def708b661a", "entity_type": "DATE", "value": "2016", "normalized": "2016", "source_id": "e935c846d8eafba4", "confidence": 0.7019931674003601, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 14 |
+
{"entity_id": "f58e8e6f3ea6435a", "entity_type": "DATE", "value": "8 вересня 2024 року", "normalized": "8 вересня 2024 року", "source_id": "9f7180e50738c510", "confidence": 0.5250833630561829, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 15 |
+
{"entity_id": "98967461c0d03924", "entity_type": "DATE", "value": "2021", "normalized": "2021", "source_id": "531476c443d08c99", "confidence": 0.8760828375816345, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 16 |
+
{"entity_id": "dfe899f01540e66e", "entity_type": "DATE", "value": "2023", "normalized": "2023", "source_id": "e46db54fa020b2a4", "confidence": 0.9276866912841797, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 17 |
+
{"entity_id": "a8ce5d63880f52fa", "entity_type": "DATE", "value": "2024", "normalized": "2024", "source_id": "c45c28112723c377", "confidence": 0.8920702934265137, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 18 |
+
{"entity_id": "58293efde3d65766", "entity_type": "PERSON", "value": "БАБЕНКО Віталіна Олексіївна", "normalized": "бабенко віталіна олексіївна", "source_id": "c0c2349542f40602", "confidence": 0.8759607076644897, "doc_ids": ["d5bf12aea01ab74a", "906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 19 |
+
{"entity_id": "bfa00e8db74c2bff", "entity_type": "PERSON", "value": "ХРУСЛОВ Максим Михайлович", "normalized": "хруслов максим михайлович", "source_id": "aec957bff3333594", "confidence": 0.8095629811286926, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 20 |
+
{"entity_id": "aa046696ad983530", "entity_type": "PERSON", "value": "СТРУКОВ Володимир Михайлович", "normalized": "струков володимир михайлович", "source_id": "aec957bff3333594", "confidence": 0.8258512616157532, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 21 |
+
{"entity_id": "c2f2ef49af307713", "entity_type": "PERSON", "value": "МАЦИЙ Ольга Борисівна", "normalized": "маций ольга борисівна", "source_id": "aec957bff3333594", "confidence": 0.8873525857925415, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 22 |
+
{"entity_id": "b972a394b89fc296", "entity_type": "PERSON", "value": "ЖИМЕРЕНКО Степан Ігорович", "normalized": "жимеренко степан ігорович", "source_id": "aec957bff3333594", "confidence": 0.8892180919647217, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 23 |
+
{"entity_id": "250d9f3642fd0543", "entity_type": "PERSON", "value": "МІХЄЄВ Іван Андрійович", "normalized": "міхєєв іван андрійович", "source_id": "5f91796791f7b0d0", "confidence": 0.8698946237564087, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 24 |
+
{"entity_id": "37948bdfda119408", "entity_type": "PERSON", "value": "Каразіна", "normalized": "каразіна", "source_id": "ec194f577de382a2", "confidence": 0.7292633056640625, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a", "906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 25 |
+
{"entity_id": "9ae9a1669500ffa1", "entity_type": "PERSON", "value": "особи", "normalized": "особи", "source_id": "ff9c88b3a7fb7942", "confidence": 0.6308590173721313, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 26 |
+
{"entity_id": "4e1c9e09669d9739", "entity_type": "PERSON", "value": "фахівців", "normalized": "фахівців", "source_id": "bef65c47c7457b9b", "confidence": 0.5742590427398682, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 27 |
+
{"entity_id": "79f52309b208feba", "entity_type": "PERSON", "value": "студент", "normalized": "студент", "source_id": "3aa93846fb2a2d54", "confidence": 0.9856151342391968, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a", "906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 28 |
+
{"entity_id": "eae30d306871e586", "entity_type": "PERSON", "value": "Стрілець Вікторія Євгенівна", "normalized": "стрілець вікторія євгенівна", "source_id": "5f91796791f7b0d0", "confidence": 0.8760467767715454, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 29 |
+
{"entity_id": "b445c482d619deb7", "entity_type": "PERSON", "value": "Мірошник Марина Анатоліївна", "normalized": "мірошник марина анатоліївна", "source_id": "d42293ae57732392", "confidence": 0.8231627345085144, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 30 |
+
{"entity_id": "2c8678efa06f67f9", "entity_type": "PERSON", "value": "Рева Сергій Миколайович", "normalized": "рева сергій миколайович", "source_id": "d42293ae57732392", "confidence": 0.8262960314750671, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 31 |
+
{"entity_id": "6dbca2dfad017f88", "entity_type": "PERSON", "value": "Малахова Марина Олегівна", "normalized": "малахова марина олегівна", "source_id": "d42293ae57732392", "confidence": 0.839874804019928, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 32 |
+
{"entity_id": "01c24beb989b4039", "entity_type": "PERSON", "value": "Бикова Тетяна Володимирівна", "normalized": "бикова тетяна володимирівна", "source_id": "d42293ae57732392", "confidence": 0.8429558873176575, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 33 |
+
{"entity_id": "ceae3c879404fdc2", "entity_type": "PERSON", "value": "КОВИЛКІНА Катерина Олександрівна", "normalized": "ковилкіна катерина олександрівна", "source_id": "d42293ae57732392", "confidence": 0.856283962726593, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 34 |
+
{"entity_id": "5e1dde48ef03f452", "entity_type": "PERSON", "value": "здобувач", "normalized": "здобувач", "source_id": "5d3f888365936987", "confidence": 0.9464806914329529, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "d5bf12aea01ab74a"], "embedding": null}
|
| 35 |
+
{"entity_id": "8b39040d3506de46", "entity_type": "PERSON", "value": "Техніки-програмісти", "normalized": "техніки-програмісти", "source_id": "e3dc8b7767957357", "confidence": 0.540186882019043, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 36 |
+
{"entity_id": "2f56f53dd8d0ca61", "entity_type": "PERSON", "value": "Бакалавр", "normalized": "бакалавр", "source_id": "7c5467d53d80fcdd", "confidence": 0.8135822415351868, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 37 |
+
{"entity_id": "05094b9aa362a52d", "entity_type": "PERSON", "value": "професорами", "normalized": "професорами", "source_id": "ad982e91da077a15", "confidence": 0.6813598275184631, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 38 |
+
{"entity_id": "d02c0c0e7726d3f1", "entity_type": "PERSON", "value": "Усі науково-педагогічні працівники", "normalized": "усі науково-педагогічні працівники", "source_id": "0772f4f3d26b8789", "confidence": 0.6251263618469238, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 39 |
+
{"entity_id": "8903da6c47c5b405", "entity_type": "PERSON", "value": "Толстолузька Олена Геннадіївна", "normalized": "толстолузька олена геннадіївна", "source_id": "5f91796791f7b0d0", "confidence": 0.8661749362945557, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 40 |
+
{"entity_id": "2a8fb4bebe84aa01", "entity_type": "PERSON", "value": "Шматков Сергій Ігорович", "normalized": "шматков сергій ігорович", "source_id": "5f91796791f7b0d0", "confidence": 0.8339876532554626, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 41 |
+
{"entity_id": "84799c7a2e037df3", "entity_type": "PERSON", "value": "Бакуменко Ніна Станіславівна", "normalized": "бакуменко ніна станіславівна", "source_id": "5f91796791f7b0d0", "confidence": 0.8757576942443848, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 42 |
+
{"entity_id": "065ba63a18a9cf96", "entity_type": "PERSON", "value": "Булавін Дмитро Олексійович", "normalized": "булавін дмитро олексійович", "source_id": "5f91796791f7b0d0", "confidence": 0.8574802279472351, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 43 |
+
{"entity_id": "ae903905b511ec19", "entity_type": "PERSON", "value": "СОЛЯНИК Юрій Вячеславович", "normalized": "соляник юрій вячеславович", "source_id": "5f91796791f7b0d0", "confidence": 0.8664460182189941, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 44 |
+
{"entity_id": "49f9040d6c533349", "entity_type": "PERSON", "value": "висококваліфікованих кадрів", "normalized": "висококваліфікованих кадрів", "source_id": "2fdeb829268a59b3", "confidence": 0.6788517236709595, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 45 |
+
{"entity_id": "c9fbd3e8cd885042", "entity_type": "PERSON", "value": "Коробчинський Кирил Петрович", "normalized": "коробчинський кирил петрович", "source_id": "c8a70c290d054a7f", "confidence": 0.7622587084770203, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 46 |
+
{"entity_id": "d9e6447f15698416", "entity_type": "PERSON", "value": "Володимир СТРУКОВ", "normalized": "володимир струков", "source_id": "3fa3e2126481683a", "confidence": 0.9238550066947937, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 47 |
+
{"entity_id": "212a4cbee48972fb", "entity_type": "PERSON", "value": "Євгеній ПОКЛОНСЬКИЙ", "normalized": "євгеній поклонський", "source_id": "3fa3e2126481683a", "confidence": 0.902168869972229, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 48 |
+
{"entity_id": "16cefbcfc3301ca1", "entity_type": "PERSON", "value": "користувачі", "normalized": "користувачі", "source_id": "2116032ea505ca64", "confidence": 0.8406650424003601, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 49 |
+
{"entity_id": "096b111a255bad7f", "entity_type": "PERSON", "value": "К. Коробчинський", "normalized": "к. коробчинський", "source_id": "0eb62e34341fc885", "confidence": 0.9662481546401978, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 50 |
+
{"entity_id": "7a34d4c3a4e2a5d9", "entity_type": "PERSON", "value": "А. Пасічник", "normalized": "а. пасічник", "source_id": "e40c978ac81c8e56", "confidence": 0.9840690493583679, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 51 |
+
{"entity_id": "6f9322cda2097382", "entity_type": "PERSON", "value": "О.Г.Трофименко", "normalized": "о.г.трофименко", "source_id": "96cc9792297b04c8", "confidence": 0.9673763513565063, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 52 |
+
{"entity_id": "50c3bc64c15d6a7b", "entity_type": "PERSON", "value": "Ю.В.Прокоп", "normalized": "ю.в.прокоп", "source_id": "96cc9792297b04c8", "confidence": 0.9656534194946289, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 53 |
+
{"entity_id": "07624d428730f41a", "entity_type": "PERSON", "value": "Н.І. Копитчук", "normalized": "н.і. копитчук", "source_id": "96cc9792297b04c8", "confidence": 0.9630191922187805, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 54 |
+
{"entity_id": "16a58157ceda2854", "entity_type": "PERSON", "value": "ДОНЕЦЬ Володимир Віталійович", "normalized": "донець володимир віталійович", "source_id": "51c002d66b88ec4f", "confidence": 0.8581857681274414, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 55 |
+
{"entity_id": "01cb9d90b7f7b0fb", "entity_type": "PERSON", "value": "МІШИН Олександр Вікторович", "normalized": "мішин олександр вікторович", "source_id": "3fa3e2126481683a", "confidence": 0.9005902409553528, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 56 |
+
{"entity_id": "ca41d203d504603f", "entity_type": "PERSON", "value": "Оксана ПОДОЛЯКА", "normalized": "оксана подоляка", "source_id": "3fa3e2126481683a", "confidence": 0.9149578213691711, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 57 |
+
{"entity_id": "4f1d66c8ddf1b649", "entity_type": "PERSON", "value": "учителем", "normalized": "учителем", "source_id": "8a9f606791f188a9", "confidence": 0.5263881087303162, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 58 |
+
{"entity_id": "ec472be46def442f", "entity_type": "PERSON", "value": "Nikos Drakos", "normalized": "nikos drakos", "source_id": "34983f681748b1c6", "confidence": 0.9861912727355957, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 59 |
+
{"entity_id": "e3c29c94fe5f788a", "entity_type": "PERSON", "value": "КОВАЛЬЧУК Дмитро Миколайович", "normalized": "ковальчук дмитро миколайович", "source_id": "9f7180e50738c510", "confidence": 0.8737912178039551, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 60 |
+
{"entity_id": "b96397144e6ee4c1", "entity_type": "PERSON", "value": "Денис КОВАЛЕНКО", "normalized": "денис коваленко", "source_id": "2319b160a36f1033", "confidence": 0.8962492346763611, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 61 |
+
{"entity_id": "3e8c1df8b14f7b40", "entity_type": "PERSON", "value": "Ольга ПЄШКОВА", "normalized": "ольга пєшкова", "source_id": "2319b160a36f1033", "confidence": 0.8972609043121338, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 62 |
+
{"entity_id": "89b95069493fa022", "entity_type": "PERSON", "value": "Олександр КОЗЛОВ", "normalized": "олександр козлов", "source_id": "2319b160a36f1033", "confidence": 0.904872715473175, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 63 |
+
{"entity_id": "604e588ff8b579b8", "entity_type": "PERSON", "value": "Олександр ПЕЛЮХ", "normalized": "олександр пелюх", "source_id": "2319b160a36f1033", "confidence": 0.9015970826148987, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 64 |
+
{"entity_id": "e27b59e793c2c29d", "entity_type": "PERSON", "value": "Ганна ЗУБЕНКО", "normalized": "ганна зубенко", "source_id": "2319b160a36f1033", "confidence": 0.6442049145698547, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 65 |
+
{"entity_id": "6ee638ce41216594", "entity_type": "ORGANIZATION", "value": "ННІ комп’ютерних наук та штучного інтелекту", "normalized": "нні комп'ютерних наук та штучного інтелекту", "source_id": "aec957bff3333594", "confidence": 0.8198742866516113, "doc_ids": ["906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 66 |
+
{"entity_id": "d6caffc95d4ea230", "entity_type": "ORGANIZATION", "value": "Міністерства освіти і науки України", "normalized": "міністерства освіти і науки україни", "source_id": "55caa1ade06bdf64", "confidence": 0.9097161293029785, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 67 |
+
{"entity_id": "5d47659c01a5bfe0", "entity_type": "ORGANIZATION", "value": "Кабінету Міністрів України", "normalized": "кабінету міністрів україни", "source_id": "55caa1ade06bdf64", "confidence": 0.7949663400650024, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 68 |
+
{"entity_id": "c50aef3ee34c16b0", "entity_type": "ORGANIZATION", "value": "кафедра штучного інтелекту та програмного забезпечення", "normalized": "кафедра штучного інтелекту та програмного забезпечення", "source_id": "7d2fabee274a867b", "confidence": 0.795989990234375, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 69 |
+
{"entity_id": "290254ba61badf4d", "entity_type": "ORGANIZATION", "value": "кафедра моделювання систем і технологій", "normalized": "кафедра моделювання систем і технологій", "source_id": "7d2fabee274a867b", "confidence": 0.7922220826148987, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 70 |
+
{"entity_id": "4fed50ed9d1c3c43", "entity_type": "ORGANIZATION", "value": "кафедра електроніки та управляючих систем", "normalized": "кафедра електроніки та управляючих систем", "source_id": "7d2fabee274a867b", "confidence": 0.849952757358551, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 71 |
+
{"entity_id": "938619211ac368a8", "entity_type": "ORGANIZATION", "value": "суспільства", "normalized": "суспільства", "source_id": "2a282e6e2664aecf", "confidence": 0.5065808296203613, "doc_ids": ["59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 72 |
+
{"entity_id": "cc66c883cdb05b6c", "entity_type": "ORGANIZATION", "value": "навчальним закладом", "normalized": "навчальним закладом", "source_id": "c0031f76c1b5be0a", "confidence": 0.556401789188385, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 73 |
+
{"entity_id": "986d05c4d03adaf6", "entity_type": "ORGANIZATION", "value": "ЕПАМ", "normalized": "епам", "source_id": "d42293ae57732392", "confidence": 0.7316806316375732, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 74 |
+
{"entity_id": "496bd95e14f270d0", "entity_type": "ORGANIZATION", "value": "МОН України", "normalized": "мон україни", "source_id": "f648911d33733b47", "confidence": 0.8786872029304504, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 75 |
+
{"entity_id": "d3934aa41810502a", "entity_type": "ORGANIZATION", "value": "Навчально-наукового інституту комп’ютерних наук та штучного інтелекту", "normalized": "навчально-наукового інституту комп'ютерних наук та штучного інтелекту", "source_id": "ff3d29a7c347f81f", "confidence": 0.7992377281188965, "doc_ids": ["b66612a925339457", "29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 76 |
+
{"entity_id": "cf3cd7d2b4d5ff70", "entity_type": "ORGANIZATION", "value": "Національне агентство із забезпечення якості вищої освіти, Україна", "normalized": "національне агентство із забезпечення якості вищої освіти, україна", "source_id": "83fdca75c9a45216", "confidence": 0.8479772210121155, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 77 |
+
{"entity_id": "09d7289e764aa019", "entity_type": "ORGANIZATION", "value": "Харківського національного університету імені В. Каразіна", "normalized": "харківського національного університету імені в. каразіна", "source_id": "0772f4f3d26b8789", "confidence": 0.8443599343299866, "doc_ids": ["b66612a925339457", "d717d12183a88d4b", "59990e00fb6dc958"], "embedding": null}
|
| 78 |
+
{"entity_id": "d62471d8c2e91d44", "entity_type": "ORGANIZATION", "value": "іноземними вищими навчальними закладами — партнерами", "normalized": "іноземними вищими навчальними закладами — партнерами", "source_id": "0772f4f3d26b8789", "confidence": 0.779074490070343, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 79 |
+
{"entity_id": "310090e8deaa9df2", "entity_type": "ORGANIZATION", "value": "університету", "normalized": "університету", "source_id": "219dd7d878ce9601", "confidence": 0.7823936939239502, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 80 |
+
{"entity_id": "13f2eb73585baa18", "entity_type": "ORGANIZATION", "value": "Каразіна Навчально-науковий інститут комп’ютерних наук та штучного інтелекту", "normalized": "каразіна навчально-науковий інститут комп'ютерних наук та штучного інтелекту", "source_id": "e640d430651812eb", "confidence": 0.5123445987701416, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 81 |
+
{"entity_id": "9d185bb8bfa967bc", "entity_type": "ORGANIZATION", "value": "ДПА", "normalized": "дпа", "source_id": "afedd4c94bd030de", "confidence": 0.6006126999855042, "doc_ids": ["cf69446fcc226f76"], "embedding": null}
|
| 82 |
+
{"entity_id": "b0c9f122b4dde681", "entity_type": "ORGANIZATION", "value": "кафедри математичного моделювання та аналізу даних", "normalized": "кафедри математичного моделювання та аналізу даних", "source_id": "3fa3e2126481683a", "confidence": 0.6874576807022095, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 83 |
+
{"entity_id": "7d7502050cc53cdd", "entity_type": "ORGANIZATION", "value": "СУБД", "normalized": "субд", "source_id": "1f7deed0ad1b97b8", "confidence": 0.5541012287139893, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 84 |
+
{"entity_id": "5af4e9f79989b6cc", "entity_type": "ORGANIZATION", "value": "Магнолія", "normalized": "магнолія", "source_id": "e40c978ac81c8e56", "confidence": 0.7429603934288025, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 85 |
+
{"entity_id": "fb8e867f51196d1e", "entity_type": "ORGANIZATION", "value": "Фенікс", "normalized": "фенікс", "source_id": "96cc9792297b04c8", "confidence": 0.7650951743125916, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 86 |
+
{"entity_id": "b8a44fca7a25d75e", "entity_type": "ORGANIZATION", "value": "Kaggle", "normalized": "kaggle", "source_id": "9749c0c98e2d3561", "confidence": 0.7928787469863892, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 87 |
+
{"entity_id": "782aee541767578b", "entity_type": "ORGANIZATION", "value": "Manning Publications", "normalized": "manning publications", "source_id": "d6b188f4e2232a2c", "confidence": 0.9487306475639343, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 88 |
+
{"entity_id": "a156e5e34505bff7", "entity_type": "ORGANIZATION", "value": "Computer Based Learning Unit", "normalized": "computer based learning unit", "source_id": "54fd7d05fdccb9d3", "confidence": 0.572193443775177, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 89 |
+
{"entity_id": "8ae61549fb9dbd7f", "entity_type": "ORGANIZATION", "value": "University of Leeds", "normalized": "university of leeds", "source_id": "34983f681748b1c6", "confidence": 0.9579901099205017, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 90 |
+
{"entity_id": "89a8294091d81ce8", "entity_type": "ORGANIZATION", "value": "coursera", "normalized": "coursera", "source_id": "80f53cdccfc87c4e", "confidence": 0.6431981325149536, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 91 |
+
{"entity_id": "baaee1ebf82f1ca0", "entity_type": "ORGANIZATION", "value": "learn.org", "normalized": "learn.org", "source_id": "448e666ba8fe40f9", "confidence": 0.6441490650177002, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 92 |
+
{"entity_id": "b5bd38732575124b", "entity_type": "ORGANIZATION", "value": "sklearn", "normalized": "sklearn", "source_id": "04f251f674cdea32", "confidence": 0.6875706315040588, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 93 |
+
{"entity_id": "34fb96ce85d84537", "entity_type": "ORGANIZATION", "value": "gitlab", "normalized": "gitlab", "source_id": "2a3122d2a379c1ce", "confidence": 0.680031955242157, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 94 |
+
{"entity_id": "5bd6e1530b6209b9", "entity_type": "ORGANIZATION", "value": "github", "normalized": "github", "source_id": "0bef74a36daf31d1", "confidence": 0.8893853425979614, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 95 |
+
{"entity_id": "8a79109dedafded4", "entity_type": "ORGANIZATION", "value": "bitbucket", "normalized": "bitbucket", "source_id": "2a3122d2a379c1ce", "confidence": 0.7396644949913025, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 96 |
+
{"entity_id": "8b2212d720e7654c", "entity_type": "ORGANIZATION", "value": "Діалектика", "normalized": "діалектика", "source_id": "531476c443d08c99", "confidence": 0.8255718946456909, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 97 |
+
{"entity_id": "3482d2ca636c03f4", "entity_type": "ORGANIZATION", "value": "Вид-во ЧНУ ім. Петра Могили", "normalized": "вид-во чну ім. петра могили", "source_id": "e46db54fa020b2a4", "confidence": 0.6376891136169434, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 98 |
+
{"entity_id": "ee26938da00b173f", "entity_type": "ORGANIZATION", "value": "Чернівецький національний університет", "normalized": "чернівецький національний університет", "source_id": "c45c28112723c377", "confidence": 0.9472811222076416, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 99 |
+
{"entity_id": "4f6771b02873d1a8", "entity_type": "ORGANIZATION", "value": "microsoft", "normalized": "microsoft", "source_id": "4fa8387b8db3832a", "confidence": 0.965456485748291, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 100 |
+
{"entity_id": "0fedc5641985ff4f", "entity_type": "ORGANIZATION", "value": "Українська інженерно-педагогічна академія", "normalized": "українська інженерно-педагогічна академія", "source_id": "2319b160a36f1033", "confidence": 0.6156907677650452, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 101 |
+
{"entity_id": "7aedef9d8d8558b6", "entity_type": "ORGANIZATION", "value": "Медичного факультету", "normalized": "медичного факультету", "source_id": "2319b160a36f1033", "confidence": 0.646909773349762, "doc_ids": ["d717d12183a88d4b"], "embedding": null}
|
| 102 |
+
{"entity_id": "319103bce9ac4ecb", "entity_type": "SPECIALITY", "value": "Комп'ютерні науки", "normalized": "комп'ютерні науки", "source_id": "c8a70c290d054a7f", "confidence": 0.8487199544906616, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a", "906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 103 |
+
{"entity_id": "3324747a90504200", "entity_type": "SPECIALITY", "value": "519 зі змінами та доповненнями від", "normalized": "519 зі змінами та доповненнями від", "source_id": "55caa1ade06bdf64", "confidence": 0.85, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 104 |
+
{"entity_id": "54e5434714345859", "entity_type": "SPECIALITY", "value": "Computer science", "normalized": "computer science", "source_id": "7d2fabee274a867b", "confidence": 0.7074233293533325, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 105 |
+
{"entity_id": "4608f9da19534fdf", "entity_type": "SPECIALITY", "value": "240 кредитів ЄКТС", "normalized": "240 кредитів єктс", "source_id": "7d2fabee274a867b", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 106 |
+
{"entity_id": "71016125a13c0e0d", "entity_type": "SPECIALITY", "value": "Інформаційні технології", "normalized": "інформаційні технології", "source_id": "bef65c47c7457b9b", "confidence": 0.7080715894699097, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 107 |
+
{"entity_id": "1b9a3e20d4e2de7d", "entity_type": "SPECIALITY", "value": "Фахівець з інформаційних технологій", "normalized": "фахівець з інформаційних технологій", "source_id": "7ba9bd963e52750c", "confidence": 0.745633602142334, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 108 |
+
{"entity_id": "aea9a80c0b30ac81", "entity_type": "SPECIALITY", "value": "Фахівець з розроблення комп’ютерних програм", "normalized": "фахівець з розроблення комп'ютерних програм", "source_id": "7ba9bd963e52750c", "confidence": 0.6395883560180664, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 109 |
+
{"entity_id": "206d4facf8046312", "entity_type": "SPECIALITY", "value": "Програмні компетентності", "normalized": "програмні компетентності", "source_id": "070c0b50ef0891a7", "confidence": 0.5735814571380615, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 110 |
+
{"entity_id": "272fc3de5af4b271", "entity_type": "SPECIALITY", "value": "Інтегральна компетентність", "normalized": "інтегральна компетентність", "source_id": "f6ee4dbcc8c3594d", "confidence": 0.8505434989929199, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 111 |
+
{"entity_id": "0968094acb78c1ad", "entity_type": "SPECIALITY", "value": "системного мислення", "normalized": "системного мислення", "source_id": "d89f7445018e510e", "confidence": 0.5229406356811523, "doc_ids": ["d5bf12aea01ab74a", "f33f68f0f3885639"], "embedding": null}
|
| 112 |
+
{"entity_id": "97722021b1a2792b", "entity_type": "SPECIALITY", "value": "логічного мислення", "normalized": "логічного мислення", "source_id": "c9333c7f250d9980", "confidence": 0.6722881197929382, "doc_ids": ["906125f79a596b75", "f33f68f0f3885639"], "embedding": null}
|
| 113 |
+
{"entity_id": "a7c0a7066c2e9018", "entity_type": "SPECIALITY", "value": "Здатність застосовувати методи та засоби забезпечення інформаційної безпеки", "normalized": "здатність застосовувати методи та засоби забезпечення інформаційної безпеки", "source_id": "17fca38c5fe9bbe2", "confidence": 0.6174511909484863, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 114 |
+
{"entity_id": "206d761f50f8f06d", "entity_type": "SPECIALITY", "value": "Здатність до аналізу та функціонального моделювання бізнес- процесів", "normalized": "здатність до аналізу та функціонального моделювання бізнес- процесів", "source_id": "17fca38c5fe9bbe2", "confidence": 0.5093329548835754, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 115 |
+
{"entity_id": "9b1d2e56d20f1ed0", "entity_type": "SPECIALITY", "value": "обчислювального інтелекту", "normalized": "обчислювального інтелекту", "source_id": "b29667928e818bb6", "confidence": 0.5286226272583008, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 116 |
+
{"entity_id": "1d00762131f7927c", "entity_type": "SPECIALITY", "value": "машинного навчання", "normalized": "машинного навчання", "source_id": "b29667928e818bb6", "confidence": 0.7234756350517273, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "f33f68f0f3885639"], "embedding": null}
|
| 117 |
+
{"entity_id": "0d39e7b35c5a076e", "entity_type": "SPECIALITY", "value": "нейромережевої та нечіткої обробки даних", "normalized": "нейромережевої та нечіткої обробки даних", "source_id": "b29667928e818bb6", "confidence": 0.5430352687835693, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 118 |
+
{"entity_id": "fc7e425c76ed0914", "entity_type": "SPECIALITY", "value": "генетичного та еволюційного програмування", "normalized": "генетичного та еволюційного програмування", "source_id": "b29667928e818bb6", "confidence": 0.6436671614646912, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 119 |
+
{"entity_id": "245ea398e76f2b9d", "entity_type": "SPECIALITY", "value": "розпізнавання", "normalized": "розпізнавання", "source_id": "b29667928e818bb6", "confidence": 0.699677586555481, "doc_ids": ["d5bf12aea01ab74a", "f33f68f0f3885639"], "embedding": null}
|
| 120 |
+
{"entity_id": "3d510e2163671925", "entity_type": "SPECIALITY", "value": "прогнозування", "normalized": "прогнозування", "source_id": "b29667928e818bb6", "confidence": 0.5668375492095947, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 121 |
+
{"entity_id": "b5dc06eb5bd93776", "entity_type": "SPECIALITY", "value": "класифікації", "normalized": "класифікації", "source_id": "b29667928e818bb6", "confidence": 0.6198435425758362, "doc_ids": ["d5bf12aea01ab74a", "f33f68f0f3885639"], "embedding": null}
|
| 122 |
+
{"entity_id": "a0afc115cc35343d", "entity_type": "SPECIALITY", "value": "стохастичного програмування", "normalized": "стохастичного програмування", "source_id": "c0031f76c1b5be0a", "confidence": 0.5148607492446899, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 123 |
+
{"entity_id": "922d634207f1bd3b", "entity_type": "SPECIALITY", "value": "навчальні дисципліни", "normalized": "навчальні дисципліни", "source_id": "3725adad1a375aae", "confidence": 0.81943678855896, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 124 |
+
{"entity_id": "06e267506e80e944", "entity_type": "SPECIALITY", "value": "практики", "normalized": "практики", "source_id": "3725adad1a375aae", "confidence": 0.6379265785217285, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 125 |
+
{"entity_id": "615aeabe0d2e90c4", "entity_type": "SPECIALITY", "value": "Об’єктно-орієнтоване програмування", "normalized": "об'єктно-орієнтоване програмування", "source_id": "ef794c82b54bd7fa", "confidence": 0.5231004357337952, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 126 |
+
{"entity_id": "34c3561b548abce9", "entity_type": "SPECIALITY", "value": "240 Примітка", "normalized": "240 примітка", "source_id": "d95be0bb9a2a5e85", "confidence": 0.85, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 127 |
+
{"entity_id": "8fe70805d62442fe", "entity_type": "SPECIALITY", "value": "комп’ютерної інженерії", "normalized": "комп'ютерної інженерії", "source_id": "a85130ce52e9d5ad", "confidence": 0.9279636144638062, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 128 |
+
{"entity_id": "c4f97b6cbc55754d", "entity_type": "SPECIALITY", "value": "Computer Engineering", "normalized": "computer engineering", "source_id": "83fdca75c9a45216", "confidence": 0.7686312198638916, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 129 |
+
{"entity_id": "4ce0475ef4976b1f", "entity_type": "SPECIALITY", "value": "302 від", "normalized": "302 від", "source_id": "83fdca75c9a45216", "confidence": 0.85, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 130 |
+
{"entity_id": "867422b0430e1564", "entity_type": "SPECIALITY", "value": "надбання спеціальних компетентностей", "normalized": "надбання спеціальних компетентностей", "source_id": "07abb3f4f4f1acfc", "confidence": 0.5332889556884766, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 131 |
+
{"entity_id": "4c8642012c1b3a70", "entity_type": "SPECIALITY", "value": "проектування", "normalized": "проектування", "source_id": "5d3f888365936987", "confidence": 0.7763451933860779, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 132 |
+
{"entity_id": "05fd3595bf5c36de", "entity_type": "SPECIALITY", "value": "інформаційні процеси", "normalized": "інформаційні процеси", "source_id": "60ad204aaed1308f", "confidence": 0.5113092064857483, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 133 |
+
{"entity_id": "5d79062899f094ab", "entity_type": "SPECIALITY", "value": "інформаційними технологіями", "normalized": "інформаційними технологіями", "source_id": "5d3f888365936987", "confidence": 0.8238152265548706, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 134 |
+
{"entity_id": "825552a12c15e926", "entity_type": "SPECIALITY", "value": "комп’ютерні системи", "normalized": "комп'ютерні системи", "source_id": "a85130ce52e9d5ad", "confidence": 0.6875548362731934, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 135 |
+
{"entity_id": "00cf239d3fdd3c0b", "entity_type": "SPECIALITY", "value": "ІТ-індустрія", "normalized": "іт-індустрія", "source_id": "a85130ce52e9d5ad", "confidence": 0.5096610188484192, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 136 |
+
{"entity_id": "9696f2942cbcefbf", "entity_type": "SPECIALITY", "value": "Професіонали в галузі фізичних, математичних та технічних наук", "normalized": "професіонали в галузі фізичних, математичних та технічних наук", "source_id": "ba847fbe086d51b1", "confidence": 0.6725888252258301, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 137 |
+
{"entity_id": "6b2065ed53799d5d", "entity_type": "SPECIALITY", "value": "Технічні фахівці в галузі прикладних наук та техніки", "normalized": "технічні фахівці в галузі прикладних наук та техніки", "source_id": "e3dc8b7767957357", "confidence": 0.5105443000793457, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 138 |
+
{"entity_id": "bb0e47ca7d64c3a2", "entity_type": "SPECIALITY", "value": "213 Професіонали в галузі обчислень", "normalized": "213 професіонали в галузі обчислень", "source_id": "e3dc8b7767957357", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 139 |
+
{"entity_id": "76840deb548f6613", "entity_type": "SPECIALITY", "value": "131 Професіонали в галузі обчислювальних сист", "normalized": "131 професіонали в галузі обчислювальних сист", "source_id": "e3dc8b7767957357", "confidence": 0.85, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 140 |
+
{"entity_id": "27539ade77e10aa5", "entity_type": "SPECIALITY", "value": "132 Професіонали в галузі програмування", "normalized": "132 професіонали в галузі програмування", "source_id": "e3dc8b7767957357", "confidence": 0.85, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 141 |
+
{"entity_id": "be6b689037cfd3f5", "entity_type": "SPECIALITY", "value": "139 Професіонали в інших галузях обчислень", "normalized": "139 професіонали в інших галузях обчислень", "source_id": "e3dc8b7767957357", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 142 |
+
{"entity_id": "f09ce6171ec29ca4", "entity_type": "SPECIALITY", "value": "312 Технічні фахівці в галузі обчислювальної", "normalized": "312 технічні фахівці в галузі обчислювальної", "source_id": "e3dc8b7767957357", "confidence": 0.85, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 143 |
+
{"entity_id": "b8ade73e2d5d467f", "entity_type": "SPECIALITY", "value": "загальних та професійних компетентностей", "normalized": "загальних та професійних компетентностей", "source_id": "22cdbb71f4bf1575", "confidence": 0.5386465787887573, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 144 |
+
{"entity_id": "8f6a3bfa88668fda", "entity_type": "SPECIALITY", "value": "математичного та комп’ютерного моделювання", "normalized": "математичного та комп'ютерного моделювання", "source_id": "22cdbb71f4bf1575", "confidence": 0.7098082900047302, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 145 |
+
{"entity_id": "a96b08dadeee4ed1", "entity_type": "SPECIALITY", "value": "володіння інформаційними технологіями", "normalized": "володіння інформаційними технологіями", "source_id": "22cdbb71f4bf1575", "confidence": 0.7292240262031555, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 146 |
+
{"entity_id": "b66f51090366d95e", "entity_type": "SPECIALITY", "value": "професійними прикладними програмами", "normalized": "професійними прикладними програмами", "source_id": "22cdbb71f4bf1575", "confidence": 0.5156592726707458, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 147 |
+
{"entity_id": "701e78c42935ac92", "entity_type": "SPECIALITY", "value": "сучасними мовами програмування", "normalized": "сучасними мовами програмування", "source_id": "22cdbb71f4bf1575", "confidence": 0.5607941746711731, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 148 |
+
{"entity_id": "b595cc5c5dbc9f27", "entity_type": "SPECIALITY", "value": "спеціальності", "normalized": "спеціальності", "source_id": "4f19ac3696aba26d", "confidence": 0.761815071105957, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 149 |
+
{"entity_id": "f8d71c90965bef12", "entity_type": "SPECIALITY", "value": "Загальні компетентності", "normalized": "загальні компетентності", "source_id": "40cabbdcd801245c", "confidence": 0.7151027321815491, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 150 |
+
{"entity_id": "322cec6ce3bc2db5", "entity_type": "SPECIALITY", "value": "ЗК 2", "normalized": "зк 2", "source_id": "b3bac256cd6b3f60", "confidence": 0.5300474166870117, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 151 |
+
{"entity_id": "ebfc6922e8102c64", "entity_type": "SPECIALITY", "value": "Фахові компетентності", "normalized": "фахові компетентності", "source_id": "40cabbdcd801245c", "confidence": 0.6449676752090454, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 152 |
+
{"entity_id": "b782f37083fcb21e", "entity_type": "SPECIALITY", "value": "ФК 1", "normalized": "фк 1", "source_id": "b3bac256cd6b3f60", "confidence": 0.5035998821258545, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 153 |
+
{"entity_id": "45aa5067f79407c7", "entity_type": "SPECIALITY", "value": "ФК 2", "normalized": "фк 2", "source_id": "b3bac256cd6b3f60", "confidence": 0.5034068822860718, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 154 |
+
{"entity_id": "bae673da8da75f34", "entity_type": "SPECIALITY", "value": "Автоматизація та комп’ютерно- інтегровані технології", "normalized": "автоматизація та комп'ютерно- інтегровані технології", "source_id": "cd050aec27af9add", "confidence": 0.7160797119140625, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 155 |
+
{"entity_id": "0c36a78449184e85", "entity_type": "SPECIALITY", "value": "603 від", "normalized": "603 від", "source_id": "e640d430651812eb", "confidence": 0.85, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 156 |
+
{"entity_id": "dfa8525adddea21c", "entity_type": "SPECIALITY", "value": "372 від", "normalized": "372 від", "source_id": "e640d430651812eb", "confidence": 0.85, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 157 |
+
{"entity_id": "0e97e28b90683458", "entity_type": "SPECIALITY", "value": "023 Передумови Для здобуття освітнього ступен", "normalized": "023 передумови для здобуття освітнього ступен", "source_id": "e640d430651812eb", "confidence": 0.85, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 158 |
+
{"entity_id": "43f1fa06d0fb0bd7", "entity_type": "SPECIALITY", "value": "технічне", "normalized": "технічне", "source_id": "51a1661d308c3929", "confidence": 0.5601323246955872, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 159 |
+
{"entity_id": "3159a3e3e182afdb", "entity_type": "SPECIALITY", "value": "математичне", "normalized": "математичне", "source_id": "51a1661d308c3929", "confidence": 0.5146043300628662, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 160 |
+
{"entity_id": "0969f1b2971aa784", "entity_type": "SPECIALITY", "value": "моделювання", "normalized": "моделювання", "source_id": "5d3f888365936987", "confidence": 0.6522191166877747, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 161 |
+
{"entity_id": "d9262d20fe39c685", "entity_type": "SPECIALITY", "value": "знаннями технічних засобів автоматизації", "normalized": "знаннями технічних засобів автоматизації", "source_id": "5d3f888365936987", "confidence": 0.5531874299049377, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 162 |
+
{"entity_id": "2e8b38e56500a7a3", "entity_type": "SPECIALITY", "value": "інженерії", "normalized": "інженерії", "source_id": "2fdeb829268a59b3", "confidence": 0.6243505477905273, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 163 |
+
{"entity_id": "c8c2ed4483d9e387", "entity_type": "SPECIALITY", "value": "робототехніки", "normalized": "робототехніки", "source_id": "2fdeb829268a59b3", "confidence": 0.8151633739471436, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 164 |
+
{"entity_id": "6a877c5c22acd65d", "entity_type": "SPECIALITY", "value": "електротехніку", "normalized": "електротехніку", "source_id": "d44febd45afddc60", "confidence": 0.9148097634315491, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 165 |
+
{"entity_id": "9bd3d8195311b86a", "entity_type": "SPECIALITY", "value": "Професіонали в галузі обчислювальних систем", "normalized": "професіонали в галузі обчислювальних систем", "source_id": "ba847fbe086d51b1", "confidence": 0.6121425628662109, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 166 |
+
{"entity_id": "9b44aac7b71f3b81", "entity_type": "SPECIALITY", "value": "Розробники обчислювальних систем", "normalized": "розробники обчислювальних систем", "source_id": "ba847fbe086d51b1", "confidence": 0.5048727989196777, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 167 |
+
{"entity_id": "f720170162ea75a3", "entity_type": "SPECIALITY", "value": "фізику", "normalized": "фізику", "source_id": "d44febd45afddc60", "confidence": 0.9121578931808472, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 168 |
+
{"entity_id": "eac8143bbdabd053", "entity_type": "SPECIALITY", "value": "схемотехніку", "normalized": "схемотехніку", "source_id": "d44febd45afddc60", "confidence": 0.608163058757782, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 169 |
+
{"entity_id": "c0b73e547fb01a0d", "entity_type": "SPECIALITY", "value": "мікропроцесорну техніку", "normalized": "мікропроцесорну техніку", "source_id": "d44febd45afddc60", "confidence": 0.8341336250305176, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 170 |
+
{"entity_id": "ca36425b9fea1047", "entity_type": "SPECIALITY", "value": "математичного моделювання", "normalized": "математичного моделювання", "source_id": "b5afffb06306c471", "confidence": 0.8945973515510559, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 171 |
+
{"entity_id": "eccdae69f659f373", "entity_type": "SPECIALITY", "value": "автоматизованого проектування", "normalized": "автоматизованого проектування", "source_id": "b5afffb06306c471", "confidence": 0.8087589740753174, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 172 |
+
{"entity_id": "5f2a79a8d81b577c", "entity_type": "SPECIALITY", "value": "керування базами даних", "normalized": "керування базами даних", "source_id": "b5afffb06306c471", "confidence": 0.6676744818687439, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 173 |
+
{"entity_id": "1932d4efc6aed553", "entity_type": "SPECIALITY", "value": "методів комп’ютерної графіки", "normalized": "методів комп'ютерної графіки", "source_id": "b5afffb06306c471", "confidence": 0.6260248422622681, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 174 |
+
{"entity_id": "c13c1778434d4401", "entity_type": "SPECIALITY", "value": "177 Вибіркові компоненти Вибіркові загальноун", "normalized": "177 вибіркові компоненти вибіркові загальноун", "source_id": "239cfef495d5fcb0", "confidence": 0.85, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 175 |
+
{"entity_id": "6ad061c8308163a4", "entity_type": "SPECIALITY", "value": "автоматизації", "normalized": "автоматизації", "source_id": "e552699b03b077fe", "confidence": 0.6890305876731873, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 176 |
+
{"entity_id": "5c3c6ed4d533a2c1", "entity_type": "SPECIALITY", "value": "026 навчальний рік", "normalized": "026 навчальний рік", "source_id": "c8a70c290d054a7f", "confidence": 0.85, "doc_ids": ["906125f79a596b75", "d5bf12aea01ab74a"], "embedding": null}
|
| 177 |
+
{"entity_id": "581d47447ce9c3f4", "entity_type": "SPECIALITY", "value": "025 року", "normalized": "025 року", "source_id": "c8a70c290d054a7f", "confidence": 0.85, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 178 |
+
{"entity_id": "97cd6dc5749e6b61", "entity_type": "SPECIALITY", "value": "алгоритмічним мисленням", "normalized": "алгоритмічним мисленням", "source_id": "c9333c7f250d9980", "confidence": 0.5652839541435242, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 179 |
+
{"entity_id": "c41170818cb6c11a", "entity_type": "SPECIALITY", "value": "ACID", "normalized": "acid", "source_id": "1f7deed0ad1b97b8", "confidence": 0.5918631553649902, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 180 |
+
{"entity_id": "9046fecf16ca5776", "entity_type": "SPECIALITY", "value": "атомарність", "normalized": "атомарність", "source_id": "1f7deed0ad1b97b8", "confidence": 0.6842527985572815, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 181 |
+
{"entity_id": "60d10086ca90e035", "entity_type": "SPECIALITY", "value": "узгодженість", "normalized": "узгодженість", "source_id": "1f7deed0ad1b97b8", "confidence": 0.5353814959526062, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 182 |
+
{"entity_id": "d82a8d307fd8eb65", "entity_type": "SPECIALITY", "value": "ізольованість", "normalized": "ізольованість", "source_id": "1f7deed0ad1b97b8", "confidence": 0.6678354740142822, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 183 |
+
{"entity_id": "cf56ebddf5a35607", "entity_type": "SPECIALITY", "value": "довговічність", "normalized": "довговічність", "source_id": "1f7deed0ad1b97b8", "confidence": 0.6776419878005981, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 184 |
+
{"entity_id": "00cd6ec8a2ea3f44", "entity_type": "SPECIALITY", "value": "Вивчення DCL", "normalized": "вивчення dcl", "source_id": "6c9896a73eb9dadb", "confidence": 0.6476343274116516, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 185 |
+
{"entity_id": "48dc9a385ba4fbd8", "entity_type": "SPECIALITY", "value": "Теоретичні завдання", "normalized": "теоретичні завдання", "source_id": "20120b0b0d706b71", "confidence": 0.534032940864563, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 186 |
+
{"entity_id": "0e25ef31b4529625", "entity_type": "SPECIALITY", "value": "Практичні завдання", "normalized": "практичні завдання", "source_id": "20120b0b0d706b71", "confidence": 0.5487654209136963, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 187 |
+
{"entity_id": "6b557e862a4e7a83", "entity_type": "SPECIALITY", "value": "100 відмінно зараховано", "normalized": "100 відмінно зараховано", "source_id": "20120b0b0d706b71", "confidence": 0.85, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 188 |
+
{"entity_id": "991851bc4eb66ce7", "entity_type": "SPECIALITY", "value": "025 навчальний рік Програму обговорено та рек", "normalized": "025 навчальний рік програму обговорено та рек", "source_id": "ff3d29a7c347f81f", "confidence": 0.85, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 189 |
+
{"entity_id": "0f8293abf49fb235", "entity_type": "SPECIALITY", "value": "024 року", "normalized": "024 року", "source_id": "ff3d29a7c347f81f", "confidence": 0.85, "doc_ids": ["d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 190 |
+
{"entity_id": "ed929af314e5c217", "entity_type": "SPECIALITY", "value": "теоретичних знань", "normalized": "теоретичних знань", "source_id": "256daeeb4709366c", "confidence": 0.5338117480278015, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 191 |
+
{"entity_id": "60102cdf4d854f34", "entity_type": "SPECIALITY", "value": "практичних навичок", "normalized": "практичних навичок", "source_id": "256daeeb4709366c", "confidence": 0.6101369261741638, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 192 |
+
{"entity_id": "7d417c302db221f8", "entity_type": "SPECIALITY", "value": "самостійна робота", "normalized": "самостійна робота", "source_id": "11d0b9f27311fcf0", "confidence": 0.6309128403663635, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 193 |
+
{"entity_id": "a0c1c19f89128b5b", "entity_type": "SPECIALITY", "value": "machine-learning-introduction", "normalized": "machine-learning-introduction", "source_id": "42aff4b4c6cfc199", "confidence": 0.7438027858734131, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 194 |
+
{"entity_id": "c73573bcb8a98942", "entity_type": "SPECIALITY", "value": "Поточний контроль", "normalized": "поточний контроль", "source_id": "11d0b9f27311fcf0", "confidence": 0.5207484364509583, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 195 |
+
{"entity_id": "0de8a786de8a293a", "entity_type": "SPECIALITY", "value": "індивідуальні завдання", "normalized": "індивідуальні завдання", "source_id": "11d0b9f27311fcf0", "confidence": 0.5213707089424133, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 196 |
+
{"entity_id": "8ef745e21a875f72", "entity_type": "LAW_REFERENCE", "value": "Закону України «Про вищу освіту»", "normalized": "закону україни «про вищу освіту»", "source_id": "55caa1ade06bdf64", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 197 |
+
{"entity_id": "f9491e067e6094a3", "entity_type": "LAW_REFERENCE", "value": "Закону України «Про наукову і науково-технічну діяльність»", "normalized": "закону україни «про наукову і науково-технічну діяльність»", "source_id": "55caa1ade06bdf64", "confidence": 0.85, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 198 |
+
{"entity_id": "44ff037b4be33247", "entity_type": "EDU_LEVEL", "value": "Перший", "normalized": "перший", "source_id": "7d2fabee274a867b", "confidence": 0.5051471590995789, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 199 |
+
{"entity_id": "b88bcdbd0c134780", "entity_type": "EDU_LEVEL", "value": "бакалавр", "normalized": "бакалавр", "source_id": "9e41a312d020a671", "confidence": 0.9087554812431335, "doc_ids": ["b66612a925339457", "59990e00fb6dc958", "f33f68f0f3885639"], "embedding": null}
|
| 200 |
+
{"entity_id": "48e8eb52059e5bab", "entity_type": "EDU_LEVEL", "value": "дворівневою", "normalized": "дворівневою", "source_id": "98bae9ab7f84fab6", "confidence": 0.5992786884307861, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 201 |
+
{"entity_id": "5500e778ecf224d5", "entity_type": "EDU_LEVEL", "value": "Університетські студії", "normalized": "університетські студії", "source_id": "ef794c82b54bd7fa", "confidence": 0.5479199886322021, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 202 |
+
{"entity_id": "346996767a29a82b", "entity_type": "EDU_LEVEL", "value": "першим", "normalized": "першим", "source_id": "cd050aec27af9add", "confidence": 0.5464401245117188, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 203 |
+
{"entity_id": "374bf375044352a6", "entity_type": "EDU_LEVEL", "value": "вищої освіти", "normalized": "вищої освіти", "source_id": "ca545a46260dd835", "confidence": 0.9126794338226318, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 204 |
+
{"entity_id": "a900f9c64ad40b8c", "entity_type": "EDU_LEVEL", "value": "другим", "normalized": "другим", "source_id": "22cdbb71f4bf1575", "confidence": 0.6678771376609802, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 205 |
+
{"entity_id": "ffe98e727478892d", "entity_type": "EDU_LEVEL", "value": "однієї або більше академічних груп", "normalized": "однієї або більше академічних груп", "source_id": "ad982e91da077a15", "confidence": 0.6272289156913757, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 206 |
+
{"entity_id": "6e95386ba46783ae", "entity_type": "EDU_LEVEL", "value": "II семестр", "normalized": "ii семестр", "source_id": "5834b4ab5eb62ddc", "confidence": 0.8962454199790955, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 207 |
+
{"entity_id": "905eb46730ee5067", "entity_type": "EDU_LEVEL", "value": "1-й семестр", "normalized": "1-й семестр", "source_id": "d45705c9d6499369", "confidence": 0.502380907535553, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 208 |
+
{"entity_id": "4e3a6a778bd02a9a", "entity_type": "DOCUMENT", "value": "Диплом бакалавра", "normalized": "диплом бакалавра", "source_id": "7d2fabee274a867b", "confidence": 0.5231388807296753, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 209 |
+
{"entity_id": "913d7e8e7c2f67f4", "entity_type": "DOCUMENT", "value": "міжнародних договорів", "normalized": "міжнародних договорів", "source_id": "c0031f76c1b5be0a", "confidence": 0.7509618997573853, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 210 |
+
{"entity_id": "055172975b24840d", "entity_type": "DOCUMENT", "value": "Сертифікат про акредитацію", "normalized": "сертифікат про акредитацію", "source_id": "83fdca75c9a45216", "confidence": 0.5850769281387329, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 211 |
+
{"entity_id": "5893788174cc1112", "entity_type": "DOCUMENT", "value": "проектна документація", "normalized": "проектна документація", "source_id": "60ad204aaed1308f", "confidence": 0.9113494157791138, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 212 |
+
{"entity_id": "19f4b16b62f7d2ac", "entity_type": "DOCUMENT", "value": "стандарти", "normalized": "стандарти", "source_id": "60ad204aaed1308f", "confidence": 0.8386316299438477, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 213 |
+
{"entity_id": "acdd1ddf99d6466e", "entity_type": "DOCUMENT", "value": "галузеві стандарти", "normalized": "галузеві стандарти", "source_id": "a85130ce52e9d5ad", "confidence": 0.7794632315635681, "doc_ids": ["b66612a925339457"], "embedding": null}
|
| 214 |
+
{"entity_id": "699ec9c1516b9078", "entity_type": "DOCUMENT", "value": "ДК 003:2010", "normalized": "дк 003:2010", "source_id": "ba847fbe086d51b1", "confidence": 0.6501777768135071, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 215 |
+
{"entity_id": "86fd6e5fe8e9acc5", "entity_type": "DOCUMENT", "value": "підручників та конспектів", "normalized": "підручників та конспектів", "source_id": "000ecf882de345cd", "confidence": 0.5020672082901001, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 216 |
+
{"entity_id": "730864a279fa38ed", "entity_type": "DOCUMENT", "value": "міжнародних стандартів", "normalized": "міжнародних стандартів", "source_id": "b5afffb06306c471", "confidence": 0.6452289819717407, "doc_ids": ["59990e00fb6dc958"], "embedding": null}
|
| 217 |
+
{"entity_id": "43c51e4e3af97e3b", "entity_type": "DOCUMENT", "value": "військовий квиток або приписне посвідчення", "normalized": "військовий квиток або приписне посвідчення", "source_id": "afedd4c94bd030de", "confidence": 0.692277193069458, "doc_ids": ["cf69446fcc226f76"], "embedding": null}
|
| 218 |
+
{"entity_id": "cef1dffeff86b2ad", "entity_type": "DOCUMENT", "value": "протокол № 2", "normalized": "протокол № 2", "source_id": "ff3d29a7c347f81f", "confidence": 0.7515961527824402, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 219 |
+
{"entity_id": "48c60bc68fad6568", "entity_type": "DOCUMENT", "value": "протокол № 1", "normalized": "протокол № 1", "source_id": "51c002d66b88ec4f", "confidence": 0.7710016369819641, "doc_ids": ["29d505a0536ac998", "906125f79a596b75"], "embedding": null}
|
| 220 |
+
{"entity_id": "c0c976dc9f08ee09", "entity_type": "DOCUMENT", "value": "ФК03", "normalized": "��к03", "source_id": "c9333c7f250d9980", "confidence": 0.6271950006484985, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 221 |
+
{"entity_id": "0fb142c0536723a5", "entity_type": "DOCUMENT", "value": "ПРН04", "normalized": "прн04", "source_id": "c9333c7f250d9980", "confidence": 0.698990523815155, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 222 |
+
{"entity_id": "4edfea8258d94845", "entity_type": "DOCUMENT", "value": "ПРН09", "normalized": "прн09", "source_id": "c9333c7f250d9980", "confidence": 0.638883113861084, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 223 |
+
{"entity_id": "86eced7f7b77fe6a", "entity_type": "DOCUMENT", "value": "тести", "normalized": "тести", "source_id": "c1f0d26d3293af30", "confidence": 0.545635461807251, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 224 |
+
{"entity_id": "b18f2b51c06200ef", "entity_type": "DOCUMENT", "value": "технічним завданням", "normalized": "технічним завданням", "source_id": "b58de4bac0bacaa4", "confidence": 0.9499786496162415, "doc_ids": ["906125f79a596b75", "d5bf12aea01ab74a"], "embedding": null}
|
| 225 |
+
{"entity_id": "7f6f05d5d2de017e", "entity_type": "DOCUMENT", "value": "протокол № 3", "normalized": "протокол № 3", "source_id": "3fa3e2126481683a", "confidence": 0.7115033268928528, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 226 |
+
{"entity_id": "50eaaf0454fed8de", "entity_type": "DOCUMENT", "value": "Звіти", "normalized": "звіти", "source_id": "47cac461812d84b6", "confidence": 0.6624485850334167, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 227 |
+
{"entity_id": "69a63c2f8c3447d6", "entity_type": "DOCUMENT", "value": "звіт", "normalized": "звіт", "source_id": "4377f6786a5a41fd", "confidence": 0.9271696209907532, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 228 |
+
{"entity_id": "784c19bb97840e78", "entity_type": "DOCUMENT", "value": "роботі", "normalized": "роботі", "source_id": "3aa93846fb2a2d54", "confidence": 0.5204114317893982, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 229 |
+
{"entity_id": "31746da19135eb78", "entity_type": "DOCUMENT", "value": "текст роботи", "normalized": "текст роботи", "source_id": "86f1ca29b96afbf4", "confidence": 0.5888989567756653, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 230 |
+
{"entity_id": "e9177cbaa806b0f2", "entity_type": "DOCUMENT", "value": "pdf", "normalized": "pdf", "source_id": "618e1da5930c5ace", "confidence": 0.7660239338874817, "doc_ids": ["29d505a0536ac998", "d782275144689a32", "d5bf12aea01ab74a"], "embedding": null}
|
| 231 |
+
{"entity_id": "d06b53ab82edfc5c", "entity_type": "DOCUMENT", "value": "перший", "normalized": "перший", "source_id": "9f7180e50738c510", "confidence": 0.5636366605758667, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 232 |
+
{"entity_id": "3661c2e944d40b82", "entity_type": "DOCUMENT", "value": "Tableau", "normalized": "tableau", "source_id": "ec4e92484f0a5a4e", "confidence": 0.8814036250114441, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 233 |
+
{"entity_id": "ee872eca29098193", "entity_type": "DOCUMENT", "value": "Power BI documentation", "normalized": "power bi documentation", "source_id": "4fa8387b8db3832a", "confidence": 0.8833033442497253, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 234 |
+
{"entity_id": "f74e210776727dd2", "entity_type": "NUMBER", "value": "Кількість кредитів", "normalized": "кількість кредитів", "source_id": "3725adad1a375aae", "confidence": 0.6140317916870117, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 235 |
+
{"entity_id": "bd8c382a93b83173", "entity_type": "NUMBER", "value": "ЗК01", "normalized": "зк01", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5338224172592163, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 236 |
+
{"entity_id": "db6dd07ac3e3c6b9", "entity_type": "NUMBER", "value": "ЗК05", "normalized": "зк05", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5359683632850647, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 237 |
+
{"entity_id": "8c57b3adbf877d92", "entity_type": "NUMBER", "value": "ЗК06", "normalized": "зк06", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5430353283882141, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 238 |
+
{"entity_id": "c6a80669b78e38d0", "entity_type": "NUMBER", "value": "ЗК09", "normalized": "зк09", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5491345524787903, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 239 |
+
{"entity_id": "76c1b09c6ba5c29d", "entity_type": "NUMBER", "value": "ЗК13", "normalized": "зк13", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5017141699790955, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 240 |
+
{"entity_id": "409f31faf973efbb", "entity_type": "NUMBER", "value": "ЗК14", "normalized": "зк14", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5935254693031311, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 241 |
+
{"entity_id": "41dd209cb0943b81", "entity_type": "NUMBER", "value": "ЗК15", "normalized": "зк15", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5997793674468994, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 242 |
+
{"entity_id": "c8c030a0bccb918c", "entity_type": "NUMBER", "value": "ЗК16", "normalized": "зк16", "source_id": "82e419ef4b6a4f3a", "confidence": 0.6185718774795532, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 243 |
+
{"entity_id": "8106aa9561ab7c2f", "entity_type": "NUMBER", "value": "ФК15", "normalized": "фк15", "source_id": "82e419ef4b6a4f3a", "confidence": 0.5016250014305115, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 244 |
+
{"entity_id": "122b3fe2e920d773", "entity_type": "NUMBER", "value": "ФК16", "normalized": "фк16", "source_id": "82e419ef4b6a4f3a", "confidence": 0.522411584854126, "doc_ids": ["f33f68f0f3885639"], "embedding": null}
|
| 245 |
+
{"entity_id": "e90e15bb3ebc3b46", "entity_type": "NUMBER", "value": "8 докторів наук", "normalized": "8 докторів наук", "source_id": "b2a06019710d6478", "confidence": 0.594826877117157, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 246 |
+
{"entity_id": "948493e60578ae31", "entity_type": "NUMBER", "value": "25 кандидатів наук", "normalized": "25 кандидатів наук", "source_id": "b2a06019710d6478", "confidence": 0.5383371710777283, "doc_ids": ["b66612a925339457", "59990e00fb6dc958"], "embedding": null}
|
| 247 |
+
{"entity_id": "352c95a60b376529", "entity_type": "NUMBER", "value": "10", "normalized": "10", "source_id": "bbb2d6d6809f3ad9", "confidence": 0.841667115688324, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 248 |
+
{"entity_id": "70fbedafe4c03090", "entity_type": "NUMBER", "value": "12 3 2 2 5", "normalized": "12 3 2 2 5", "source_id": "43d1da5288d7920a", "confidence": 0.6851680874824524, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 249 |
+
{"entity_id": "7da8314b0a04d391", "entity_type": "NUMBER", "value": "120", "normalized": "120", "source_id": "43d1da5288d7920a", "confidence": 0.6902284026145935, "doc_ids": ["906125f79a596b75", "d5bf12aea01ab74a"], "embedding": null}
|
| 250 |
+
{"entity_id": "dd9e68038557a4d1", "entity_type": "NUMBER", "value": "120 32 16 16 0 56", "normalized": "120 32 16 16 0 56", "source_id": "43d1da5288d7920a", "confidence": 0.6905953884124756, "doc_ids": ["906125f79a596b75"], "embedding": null}
|
| 251 |
+
{"entity_id": "355b77129964e5ba", "entity_type": "NUMBER", "value": "0 балів", "normalized": "0 балів", "source_id": "9e384c927f76dbf0", "confidence": 0.6238329410552979, "doc_ids": ["29d505a0536ac998", "906125f79a596b75", "d782275144689a32"], "embedding": null}
|
| 252 |
+
{"entity_id": "cb0eaa459b89d023", "entity_type": "NUMBER", "value": "Кількість годин", "normalized": "кількість годин", "source_id": "12639a6d0134394d", "confidence": 0.5649511218070984, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 253 |
+
{"entity_id": "8ea18a7337f8bebc", "entity_type": "NUMBER", "value": "5 балів", "normalized": "5 балів", "source_id": "5a49d50c4b784809", "confidence": 0.7059215903282166, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 254 |
+
{"entity_id": "ed1e551c2e2c23f4", "entity_type": "NUMBER", "value": "3 балів", "normalized": "3 балів", "source_id": "9e384c927f76dbf0", "confidence": 0.7172594666481018, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 255 |
+
{"entity_id": "e875322dc2569a22", "entity_type": "NUMBER", "value": "2 балів", "normalized": "2 балів", "source_id": "5a49d50c4b784809", "confidence": 0.6369964480400085, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 256 |
+
{"entity_id": "970141d598e2b973", "entity_type": "NUMBER", "value": "1 бал", "normalized": "1 бал", "source_id": "5a49d50c4b784809", "confidence": 0.6101020574569702, "doc_ids": ["29d505a0536ac998", "d782275144689a32"], "embedding": null}
|
| 257 |
+
{"entity_id": "50b9cb80c26211e4", "entity_type": "NUMBER", "value": "16-20 балів", "normalized": "16-20 балів", "source_id": "99190c180bdba5e1", "confidence": 0.6548917293548584, "doc_ids": ["29d505a0536ac998"], "embedding": null}
|
| 258 |
+
{"entity_id": "6b97bbdb92030366", "entity_type": "NUMBER", "value": "15 балів", "normalized": "15 балів", "source_id": "99190c180bdba5e1", "confidence": 0.6920886635780334, "doc_ids": ["29d505a0536ac998"], "embedding": null}
|
| 259 |
+
{"entity_id": "72c3ce0a8eee26de", "entity_type": "NUMBER", "value": "13-14 балів", "normalized": "13-14 балів", "source_id": "99190c180bdba5e1", "confidence": 0.6282890439033508, "doc_ids": ["29d505a0536ac998"], "embedding": null}
|
| 260 |
+
{"entity_id": "dd19c79674faa7bd", "entity_type": "NUMBER", "value": "5-12 балів", "normalized": "5-12 балів", "source_id": "99190c180bdba5e1", "confidence": 0.6774356961250305, "doc_ids": ["29d505a0536ac998"], "embedding": null}
|
| 261 |
+
{"entity_id": "189fbf0e885b2cd2", "entity_type": "NUMBER", "value": "67", "normalized": "67", "source_id": "cb820b8eef70d979", "confidence": 0.7188746929168701, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 262 |
+
{"entity_id": "7195222d09304114", "entity_type": "NUMBER", "value": "32", "normalized": "32", "source_id": "746f49fefb9761f2", "confidence": 0.5238321423530579, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 263 |
+
{"entity_id": "b39b76920d6df157", "entity_type": "NUMBER", "value": "56", "normalized": "56", "source_id": "1f4d0744f6c2aee8", "confidence": 0.5448365211486816, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
| 264 |
+
{"entity_id": "5187796566007fb7", "entity_type": "NUMBER", "value": "максимальна кількість балів", "normalized": "максимальна кількість балів", "source_id": "799a59af806e688c", "confidence": 0.6007808446884155, "doc_ids": ["d5bf12aea01ab74a"], "embedding": null}
|
data/prebuilt/corpus/relations.jsonl
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"relation_id": "1827f1f1c7f580a7", "source_entity_id": "d6caffc95d4ea230", "target_entity_id": "2396ca191bb74a2f", "relation_type": "related_to", "text_evidence": "вищої освіти, наказ Міністерства освіти і науки України від 10 липня 2019 року № 962 2) Закону Укр", "confidence": 0.88, "source": "proximity"}
|
| 2 |
+
{"relation_id": "4c6afcfff801a5dc", "source_entity_id": "d6caffc95d4ea230", "target_entity_id": "8ef745e21a875f72", "relation_type": "related_to", "text_evidence": "вищої освіти, наказ Міністерства освіти і науки України від 10 липня 2019 року № 962 2) Закону України «Про вищу освіту» від 01.07.2014 р. №", "confidence": 0.796, "source": "proximity"}
|
| 3 |
+
{"relation_id": "dc02a37874f9286e", "source_entity_id": "d6caffc95d4ea230", "target_entity_id": "f9491e067e6094a3", "relation_type": "related_to", "text_evidence": "вищої освіти, наказ Міністерства освіти і науки України від 10 липня 2019 року № 962 2) Закону України «Про вищу освіту» від 01.07.2014 р. № 1556-VII зі змінами та доповненнями. 3) Закону України «Про", "confidence": 0.517, "source": "proximity"}
|
| 4 |
+
{"relation_id": "b5826819e597004b", "source_entity_id": "2396ca191bb74a2f", "target_entity_id": "8ef745e21a875f72", "relation_type": "related_to", "text_evidence": "і науки України від 10 липня 2019 року № 962 2) Закону України «Про вищу освіту» від 01.07.2014 р. №", "confidence": 0.916, "source": "proximity"}
|
| 5 |
+
{"relation_id": "0165927043c66e1e", "source_entity_id": "2396ca191bb74a2f", "target_entity_id": "f9491e067e6094a3", "relation_type": "related_to", "text_evidence": "і науки України від 10 липня 2019 року № 962 2) Закону України «Про вищу освіту» від 01.07.2014 р. № 1556-VII зі змінами та доповненнями. 3) Закону України «Про наукову і науково-технічну діяльність» ", "confidence": 0.637, "source": "proximity"}
|
| 6 |
+
{"relation_id": "1e88980ac452fd11", "source_entity_id": "f9491e067e6094a3", "target_entity_id": "5d47659c01a5bfe0", "relation_type": "related_to", "text_evidence": "та доповненнями. 3) Закону України «Про наукову і науково-технічну діяльність» від 26.11.2015 р. №848-VIII зі змінами та доповненнями. 4) Національної рамки кваліфікацій (Додаток до постанови Кабінету", "confidence": 0.484, "source": "proximity"}
|
| 7 |
+
{"relation_id": "be924b11a8343bb0", "source_entity_id": "5d47659c01a5bfe0", "target_entity_id": "5bce0cd172cf9389", "relation_type": "related_to", "text_evidence": "одаток до постанови Кабінету Міністрів України від 23 листопада 2011 р. № 1341 (в редакц", "confidence": 0.907, "source": "proximity"}
|
| 8 |
+
{"relation_id": "b0f806b4c8e1bb5a", "source_entity_id": "5d47659c01a5bfe0", "target_entity_id": "dd260601f883f45b", "relation_type": "related_to", "text_evidence": "одаток до постанови Кабінету Міністрів України від 23 листопада 2011 р. № 1341 (в редакції постанови Кабінету Міністрів України від 25 червня 2020 р. №519 зі змінами ", "confidence": 0.6639999999999999, "source": "proximity"}
|
| 9 |
+
{"relation_id": "68988bfb7c55dcce", "source_entity_id": "5d47659c01a5bfe0", "target_entity_id": "3324747a90504200", "relation_type": "related_to", "text_evidence": "одаток до постанови Кабінету Міністрів України від 23 листопада 2011 р. № 1341 (в редакції постанови Кабінету Міністрів України від 25 червня 2020 р. №519 зі змінами та доповненнями від 16.01.2024 р.)", "confidence": 0.607, "source": "proximity"}
|
| 10 |
+
{"relation_id": "5604d37959b6d32a", "source_entity_id": "5bce0cd172cf9389", "target_entity_id": "3324747a90504200", "relation_type": "related_to", "text_evidence": "ністрів України від 23 листопада 2011 р. № 1341 (в редакції постанови Кабінету Міністрів України від 25 червня 2020 р. №519 зі змінами та доповненнями від 16.01.2024 р.). 5) ", "confidence": 0.7, "source": "proximity"}
|
| 11 |
+
{"relation_id": "0f903edfb53101aa", "source_entity_id": "dd260601f883f45b", "target_entity_id": "3324747a90504200", "relation_type": "related_to", "text_evidence": "ністрів України від 25 червня 2020 р. №519 зі змінами та доповненнями від 16.01.2024 р.). 5) ", "confidence": 0.9430000000000001, "source": "proximity"}
|
| 12 |
+
{"relation_id": "ca970ef5e5339570", "source_entity_id": "37948bdfda119408", "target_entity_id": "c50aef3ee34c16b0", "relation_type": "related_to", "text_evidence": "Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделюванн", "confidence": 0.97, "source": "proximity"}
|
| 13 |
+
{"relation_id": "4dda7284c4014a8c", "source_entity_id": "37948bdfda119408", "target_entity_id": "290254ba61badf4d", "relation_type": "related_to", "text_evidence": "Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделювання систем і технологій, кафедра електронік", "confidence": 0.802, "source": "proximity"}
|
| 14 |
+
{"relation_id": "de7c77d746b6a95a", "source_entity_id": "37948bdfda119408", "target_entity_id": "4fed50ed9d1c3c43", "relation_type": "related_to", "text_evidence": "Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделювання систем і технологій, кафедра електроніки та управляючих систем Офіційна назва прог", "confidence": 0.679, "source": "proximity"}
|
| 15 |
+
{"relation_id": "1a993c2c43f7d23c", "source_entity_id": "37948bdfda119408", "target_entity_id": "54e5434714345859", "relation_type": "related_to", "text_evidence": "Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделювання систем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer ", "confidence": 0.42700000000000005, "source": "proximity"}
|
| 16 |
+
{"relation_id": "9d12db2fc5c17aed", "source_entity_id": "c50aef3ee34c16b0", "target_entity_id": "54e5434714345859", "relation_type": "related_to", "text_evidence": "Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделювання систем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer ", "confidence": 0.45699999999999996, "source": "proximity"}
|
| 17 |
+
{"relation_id": "4451cd4aae270f49", "source_entity_id": "290254ba61badf4d", "target_entity_id": "54e5434714345859", "relation_type": "related_to", "text_evidence": "много забезпечення, кафедра моделювання систем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer science Ступінь вищої освіт", "confidence": 0.625, "source": "proximity"}
|
| 18 |
+
{"relation_id": "85463bfefe7985e3", "source_entity_id": "290254ba61badf4d", "target_entity_id": "44ff037b4be33247", "relation_type": "related_to", "text_evidence": "много забезпечення, кафедра моделювання систем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer science Ступінь вищої освіти Перший (бакалаврс", "confidence": 0.5110000000000001, "source": "proximity"}
|
| 19 |
+
{"relation_id": "e6c05f555305574f", "source_entity_id": "4fed50ed9d1c3c43", "target_entity_id": "54e5434714345859", "relation_type": "related_to", "text_evidence": "истем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer science Ступінь вищої освіт", "confidence": 0.748, "source": "proximity"}
|
| 20 |
+
{"relation_id": "fc31a92c0bbc1a6d", "source_entity_id": "4fed50ed9d1c3c43", "target_entity_id": "44ff037b4be33247", "relation_type": "related_to", "text_evidence": "истем і технологій, кафедра електроніки та управляючих систем Офіційна назва програми Комп’ютерні науки Computer science Ступінь вищої освіти Перший (бакалаврський) рів", "confidence": 0.634, "source": "proximity"}
|
| 21 |
+
{"relation_id": "046f30e0b3d07e2f", "source_entity_id": "54e5434714345859", "target_entity_id": "44ff037b4be33247", "relation_type": "related_to", "text_evidence": "и Комп’ютерні науки Computer science Ступінь вищої освіти Перший (бакалаврський) рів", "confidence": 0.886, "source": "proximity"}
|
| 22 |
+
{"relation_id": "bb97645dee55950b", "source_entity_id": "54e5434714345859", "target_entity_id": "4e3a6a778bd02a9a", "relation_type": "related_to", "text_evidence": "и Комп’ютерні науки Computer science Ступінь вищої освіти Перший (бакалаврський) рівень Кваліфікація, що присвоюється Бакалавр з комп’ютерних наук Тип диплому та обсяг освітньої програми Диплом бакала", "confidence": 0.499, "source": "proximity"}
|
| 23 |
+
{"relation_id": "ef4938301c4c615a", "source_entity_id": "44ff037b4be33247", "target_entity_id": "4e3a6a778bd02a9a", "relation_type": "related_to", "text_evidence": "тупінь вищої освіти Перший (бакалаврський) рівень Кваліфікація, що присвоюється Бакалавр з комп’ютерних наук Тип диплому та обсяг освітньої програми Диплом бакалавра, одиничний, 240 кре", "confidence": 0.613, "source": "proximity"}
|
| 24 |
+
{"relation_id": "88167eabb9ee7480", "source_entity_id": "44ff037b4be33247", "target_entity_id": "4608f9da19534fdf", "relation_type": "related_to", "text_evidence": "тупінь вищої освіти Перший (бакалаврський) рівень Кваліфікація, що присвоюється Бакалавр з комп’ютерних наук Тип диплому та обсяг освітньої програми Диплом бакалавра, одиничний, 240 кредитів ЄКТС, тер", "confidence": 0.526, "source": "proximity"}
|
| 25 |
+
{"relation_id": "a0678e4b6bc8283f", "source_entity_id": "4e3a6a778bd02a9a", "target_entity_id": "4608f9da19534fdf", "relation_type": "related_to", "text_evidence": " освітньої програми Диплом бакалавра, одиничний, 240 кредитів ЄКТС, термін навчання: д", "confidence": 0.913, "source": "proximity"}
|
| 26 |
+
{"relation_id": "7bbc9c21e197dfcc", "source_entity_id": "b88bcdbd0c134780", "target_entity_id": "206d4facf8046312", "relation_type": "related_to", "text_evidence": "ліфікаційної роботи бакалавра). 6 – Програмні компетентності Інтегральна компете", "confidence": 0.952, "source": "proximity"}
|
| 27 |
+
{"relation_id": "7fa00d06f813d8f2", "source_entity_id": "b88bcdbd0c134780", "target_entity_id": "272fc3de5af4b271", "relation_type": "related_to", "text_evidence": "ліфікаційної роботи бакалавра). 6 – Програмні компетентності Інтегральна компетентність Здатність розв’язув", "confidence": 0.877, "source": "proximity"}
|
| 28 |
+
{"relation_id": "ec6a693cd64eb9be", "source_entity_id": "913d7e8e7c2f67f4", "target_entity_id": "cc66c883cdb05b6c", "relation_type": "related_to", "text_evidence": "авчання на підставі міжнародних договорів на умовах, визначених цими договорами, а також договорів, укладених навчальним закладом із зарубіжними навч", "confidence": 0.73, "source": "proximity"}
|
| 29 |
+
{"relation_id": "5f6ec27c1852d3db", "source_entity_id": "922d634207f1bd3b", "target_entity_id": "f74e210776727dd2", "relation_type": "related_to", "text_evidence": "освітньої програми (навчальні дисципліни, курсові проекти (роботи), практики, кваліфікаційна робота) Кількість кредитів Форма підсумкового ", "confidence": 0.757, "source": "proximity"}
|
| 30 |
+
{"relation_id": "155e6c0cf3e20acb", "source_entity_id": "06e267506e80e944", "target_entity_id": "f74e210776727dd2", "relation_type": "related_to", "text_evidence": "і проекти (роботи), практики, кваліфікаційна робота) Кількість кредитів Форма підсумкового ", "confidence": 0.901, "source": "proximity"}
|
| 31 |
+
{"relation_id": "c923c83b5ea29c28", "source_entity_id": "ceae3c879404fdc2", "target_entity_id": "986d05c4d03adaf6", "relation_type": "related_to", "text_evidence": "вачів вищої освіти: КОВИЛКІНА Катерина Олександрівна Представники роботодавців: МІХЄЄВ Іван Андрійович, Директор Східнорегіонального відділу ЕПАМ, к.т.н.", "confidence": 0.637, "source": "proximity"}
|
| 32 |
+
{"relation_id": "69b3c2d6cff3c530", "source_entity_id": "250d9f3642fd0543", "target_entity_id": "986d05c4d03adaf6", "relation_type": "related_to", "text_evidence": "вники роботодавців: МІХЄЄВ Іван Андрійович, Директор Східнорегіонального відділу ЕПАМ, к.т.н.", "confidence": 0.817, "source": "proximity"}
|
| 33 |
+
{"relation_id": "03f53825251f125a", "source_entity_id": "346996767a29a82b", "target_entity_id": "496bd95e14f270d0", "relation_type": "related_to", "text_evidence": "терна інженерія» за першим (бакалаврським) рівнем вищої освіти , затвердженого наказом МОН України від 19.11.2018 р. з", "confidence": 0.7989999999999999, "source": "proximity"}
|
| 34 |
+
{"relation_id": "7cdd54abd954181e", "source_entity_id": "346996767a29a82b", "target_entity_id": "8ef745e21a875f72", "relation_type": "related_to", "text_evidence": "терна інженерія» за першим (бакалаврським) рівнем вищої освіти , затвердженого наказом МОН України від 19.11.2018 р. за № 1262 2) Закону України «Про вищу освіту» від 01.07.2014 р. №", "confidence": 0.6699999999999999, "source": "proximity"}
|
| 35 |
+
{"relation_id": "0daf154286065906", "source_entity_id": "496bd95e14f270d0", "target_entity_id": "8ef745e21a875f72", "relation_type": "related_to", "text_evidence": "твердженого наказом МОН України від 19.11.2018 р. за № 1262 2) Закону України «Про вищу освіту» від 01.07.2014 р. №", "confidence": 0.871, "source": "proximity"}
|
| 36 |
+
{"relation_id": "7435c98d996ba049", "source_entity_id": "496bd95e14f270d0", "target_entity_id": "f9491e067e6094a3", "relation_type": "related_to", "text_evidence": "твердженого наказом МОН України від 19.11.2018 р. за № 1262 2) Закону України «Про вищу освіту» від 01.07.2014 р. № 1556-VII зі змінами та доповненнями. 3) Закону України «Про наукову і науково-техніч", "confidence": 0.592, "source": "proximity"}
|
| 37 |
+
{"relation_id": "0c6512abd04e5c61", "source_entity_id": "4608f9da19534fdf", "target_entity_id": "055172975b24840d", "relation_type": "related_to", "text_evidence": "калавра, одиничний, 240 кредитів ЄКТС, 3 роки 10 місяців на основі повної загальної середньої освіти Наявність акредитації Акредитована, Сертифікат про акредитацію видало Національне ", "confidence": 0.649, "source": "proximity"}
|
| 38 |
+
{"relation_id": "354d993bdaeae97e", "source_entity_id": "4608f9da19534fdf", "target_entity_id": "cf3cd7d2b4d5ff70", "relation_type": "related_to", "text_evidence": "калавра, одиничний, 240 кредитів ЄКТС, 3 роки 10 місяців на основі повної загальної середньої освіти Наявність акредитації Акредитована, Сертифікат про акредитацію видало Національне агентство із забе", "confidence": 0.547, "source": "proximity"}
|
| 39 |
+
{"relation_id": "0b2ce91a6ce02d9e", "source_entity_id": "055172975b24840d", "target_entity_id": "cf3cd7d2b4d5ff70", "relation_type": "related_to", "text_evidence": "тації Акредитована, Сертифікат про акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, №302 від 14.05.202", "confidence": 0.898, "source": "proximity"}
|
| 40 |
+
{"relation_id": "93fcd2cb57adeced", "source_entity_id": "055172975b24840d", "target_entity_id": "4ce0475ef4976b1f", "relation_type": "related_to", "text_evidence": "тації Акредитована, Сертифікат про акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, №302 від 14.05.2020 р. Перед", "confidence": 0.6910000000000001, "source": "proximity"}
|
| 41 |
+
{"relation_id": "9ef36ebf024d07b2", "source_entity_id": "cf3cd7d2b4d5ff70", "target_entity_id": "4ce0475ef4976b1f", "relation_type": "related_to", "text_evidence": " акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, №302 від 14.05.2020 р. Перед", "confidence": 0.793, "source": "proximity"}
|
| 42 |
+
{"relation_id": "7f33aa246a13c403", "source_entity_id": "05fd3595bf5c36de", "target_entity_id": "5893788174cc1112", "relation_type": "related_to", "text_evidence": "ї їх компонентів; - інформаційні процеси, технології, методи, способи та системи автоматизованого та автоматичного проектування; налагодження, виробництва й експлуатації, проектна документація, станда", "confidence": 0.547, "source": "proximity"}
|
| 43 |
+
{"relation_id": "810228ff224457b9", "source_entity_id": "05fd3595bf5c36de", "target_entity_id": "19f4b16b62f7d2ac", "relation_type": "related_to", "text_evidence": "ї їх компонентів; - інформаційні процеси, технології, методи, способи та системи автоматизованого та автоматичного проектування; налагодження, виробництва й експлуатації, проектна документація, станда", "confidence": 0.478, "source": "proximity"}
|
| 44 |
+
{"relation_id": "a1adc15cbc6aa246", "source_entity_id": "5e1dde48ef03f452", "target_entity_id": "374bf375044352a6", "relation_type": "related_to", "text_evidence": "якими має оволодіти здобувач вищої освіти для застосовування ", "confidence": 0.973, "source": "proximity"}
|
| 45 |
+
{"relation_id": "2ecb94bf4fb18d73", "source_entity_id": "acdd1ddf99d6466e", "target_entity_id": "00cf239d3fdd3c0b", "relation_type": "related_to", "text_evidence": "п’ютерна інженерія, галузеві стандарти, навчальне навантаження, комп’ютерні системи, ІТ-індустрія, студентоцентроване", "confidence": 0.8049999999999999, "source": "proximity"}
|
| 46 |
+
{"relation_id": "fa3374d095a28939", "source_entity_id": "699ec9c1516b9078", "target_entity_id": "9696f2942cbcefbf", "relation_type": "related_to", "text_evidence": "сифікатора України: ДК 003:2010): 21 Професіонали в галузі фізичних, математичних та технічних наук; 213 Професіонали в", "confidence": 0.949, "source": "proximity"}
|
| 47 |
+
{"relation_id": "754434ad6bf3eb56", "source_entity_id": "699ec9c1516b9078", "target_entity_id": "bb0e47ca7d64c3a2", "relation_type": "related_to", "text_evidence": "сифікатора України: ДК 003:2010): 21 Професіонали в галузі фізичних, математичних та технічних наук; 213 Професіонали в галузі обчислень (комп'ютеризації); ", "confidence": 0.757, "source": "proximity"}
|
| 48 |
+
{"relation_id": "6b69a55ede8381b7", "source_entity_id": "699ec9c1516b9078", "target_entity_id": "76840deb548f6613", "relation_type": "related_to", "text_evidence": "сифікатора України: ДК 003:2010): 21 Професіонали в галузі фізичних, математичних та технічних наук; 213 Професіонали в галузі обчислень (комп'ютеризації); 2131 Професіонали в галузі обчислювальних си", "confidence": 0.589, "source": "proximity"}
|
| 49 |
+
{"relation_id": "5f930453b77961a3", "source_entity_id": "699ec9c1516b9078", "target_entity_id": "27539ade77e10aa5", "relation_type": "related_to", "text_evidence": "сифікатора України: ДК 003:2010): 21 Професіонали в галузі фізичних, математичних та технічних наук; 213 Професіонали в галузі обчислень (комп'ютеризації); 2131 Професіонали в галузі обчислювальних си", "confidence": 0.43899999999999995, "source": "proximity"}
|
| 50 |
+
{"relation_id": "db64775d768c3c84", "source_entity_id": "6b2065ed53799d5d", "target_entity_id": "8b39040d3506de46", "relation_type": "related_to", "text_evidence": " мереж і систем; 31 Технічні фахівці в галузі прикладних наук та техніки; 312 Технічні фахівці в галузі обчислювальної техніки; 3121 Техніки-програмісти.", "confidence": 0.661, "source": "proximity"}
|
| 51 |
+
{"relation_id": "8237923432f8e0ab", "source_entity_id": "f09ce6171ec29ca4", "target_entity_id": "8b39040d3506de46", "relation_type": "related_to", "text_evidence": "их наук та техніки; 312 Технічні фахівці в галузі обчислювальної техніки; 3121 Техніки-програмісти.", "confidence": 0.823, "source": "proximity"}
|
| 52 |
+
{"relation_id": "5a6a2af5c06950ce", "source_entity_id": "2f56f53dd8d0ca61", "target_entity_id": "a900f9c64ad40b8c", "relation_type": "related_to", "text_evidence": "Подальше навчання Бакалавр з комп’ютерної інженерії має право продовжити навчання за другим (магістерським) рів", "confidence": 0.7989999999999999, "source": "proximity"}
|
| 53 |
+
{"relation_id": "fbce2c37693d8296", "source_entity_id": "a900f9c64ad40b8c", "target_entity_id": "b8ade73e2d5d467f", "relation_type": "related_to", "text_evidence": "довжити навчання за другим (магістерським) рівнем освіти. 5 - Викладання та оцінювання Викладання та навчання Студентоцентроване, проблемно-орієнтоване навчання з набуттям загальних та професійних ком", "confidence": 0.544, "source": "proximity"}
|
| 54 |
+
{"relation_id": "6bf5e42e6094b4f3", "source_entity_id": "05094b9aa362a52d", "target_entity_id": "ffe98e727478892d", "relation_type": "related_to", "text_evidence": "одяться лекторами – професорами i доцентами, а також провідними науковцями або спеціалістами, запрошеними для читання лекцій. Лекції проводяться у відповідно обладнаних приміщеннях – аудиторіях для од", "confidence": 0.46599999999999997, "source": "proximity"}
|
| 55 |
+
{"relation_id": "baa4cd3169a66726", "source_entity_id": "d02c0c0e7726d3f1", "target_entity_id": "e90e15bb3ebc3b46", "relation_type": "related_to", "text_evidence": "рового забезпечення Усі науково-педагогічні працівники, залучені до реалізації освітньої складової освітньо-професійної програми (8 докторів наук та 25 кандидатів на", "confidence": 0.6699999999999999, "source": "proximity"}
|
| 56 |
+
{"relation_id": "eb8ff89ffc128240", "source_entity_id": "d02c0c0e7726d3f1", "target_entity_id": "948493e60578ae31", "relation_type": "related_to", "text_evidence": "рового забезпечення Усі науково-педагогічні працівники, залучені до реалізації освітньої складової освітньо-професійної програми (8 докторів наук та 25 кандидатів наук) є штатними співроб", "confidence": 0.613, "source": "proximity"}
|
| 57 |
+
{"relation_id": "9a81d6e0418686a8", "source_entity_id": "d02c0c0e7726d3f1", "target_entity_id": "09d7289e764aa019", "relation_type": "related_to", "text_evidence": "рового забезпечення Усі науково-педагогічні працівники, залучені до реалізації освітньої складової освітньо-професійної програми (8 докторів наук та 25 кандидатів наук) є штатними співробітниками Харк", "confidence": 0.472, "source": "proximity"}
|
| 58 |
+
{"relation_id": "a303cac3012bdd15", "source_entity_id": "e90e15bb3ebc3b46", "target_entity_id": "09d7289e764aa019", "relation_type": "related_to", "text_evidence": "офесійної програми (8 докторів наук та 25 кандидатів наук) є штатними співробітниками Харківського національного університету імені В. Каразіна та іноземними вищим", "confidence": 0.802, "source": "proximity"}
|
| 59 |
+
{"relation_id": "8fed7860d7a9154e", "source_entity_id": "e90e15bb3ebc3b46", "target_entity_id": "d62471d8c2e91d44", "relation_type": "related_to", "text_evidence": "офесійної програми (8 докторів наук та 25 кандидатів наук) є штатними співробітниками Харківського національного університету імені В. Каразіна та іноземними вищими навчальними закладами — партнерами,", "confidence": 0.619, "source": "proximity"}
|
| 60 |
+
{"relation_id": "0dd3bcb47316431c", "source_entity_id": "948493e60578ae31", "target_entity_id": "09d7289e764aa019", "relation_type": "related_to", "text_evidence": "(8 докторів наук та 25 кандидатів наук) є штатними співробітниками Харківського національного університету імені В. Каразіна та іноземними вищим", "confidence": 0.859, "source": "proximity"}
|
| 61 |
+
{"relation_id": "afc7997fe563b283", "source_entity_id": "948493e60578ae31", "target_entity_id": "d62471d8c2e91d44", "relation_type": "related_to", "text_evidence": "(8 докторів наук та 25 кандидатів наук) є штатними співробітниками Харківського національного університету імені В. Каразіна та іноземними вищими навчальними закладами — партнерами, можуть бути зарахо", "confidence": 0.6759999999999999, "source": "proximity"}
|
| 62 |
+
{"relation_id": "0099e07214500587", "source_entity_id": "b595cc5c5dbc9f27", "target_entity_id": "8903da6c47c5b405", "relation_type": "related_to", "text_evidence": ", за якою кафедрою (спеціальністю) присвоєно Керівник робочої групи – гарант ОП Толстолузька Олена Геннадіївна професор кафедри ко", "confidence": 0.8200000000000001, "source": "proximity"}
|
| 63 |
+
{"relation_id": "2fd1149ea74ce3d3", "source_entity_id": "ae903905b511ec19", "target_entity_id": "986d05c4d03adaf6", "relation_type": "related_to", "text_evidence": "вачів вищої освіти: СОЛЯНИК Юрій Вячеславович П��едставники роботодавців: МІХЄЄВ Іван Андрійович, Директор Східнорегіонального відділу ЕПАМ, к.т.н.", "confidence": 0.658, "source": "proximity"}
|
| 64 |
+
{"relation_id": "5003eee7916c5468", "source_entity_id": "cf3cd7d2b4d5ff70", "target_entity_id": "0c36a78449184e85", "relation_type": "related_to", "text_evidence": " акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, № 1603 від 21.05.2021, № 5372 ", "confidence": 0.787, "source": "proximity"}
|
| 65 |
+
{"relation_id": "d576437e1c6576ee", "source_entity_id": "cf3cd7d2b4d5ff70", "target_entity_id": "1a39e93b2f96d97a", "relation_type": "related_to", "text_evidence": " акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2", "confidence": 0.763, "source": "proximity"}
|
| 66 |
+
{"relation_id": "6a8f1c9da4a33d43", "source_entity_id": "cf3cd7d2b4d5ff70", "target_entity_id": "dfa8525adddea21c", "relation_type": "related_to", "text_evidence": " акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2023 Передумо", "confidence": 0.718, "source": "proximity"}
|
| 67 |
+
{"relation_id": "bb3c5744b698f29f", "source_entity_id": "cf3cd7d2b4d5ff70", "target_entity_id": "0e97e28b90683458", "relation_type": "related_to", "text_evidence": " акредитацію видало Національне агентство із забезпечення якості вищої освіти, Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2023 Передумови Для здобуття освітнього ступеня бакалавра можуть в", "confidence": 0.673, "source": "proximity"}
|
| 68 |
+
{"relation_id": "ae3bad14d80885db", "source_entity_id": "0c36a78449184e85", "target_entity_id": "1a39e93b2f96d97a", "relation_type": "related_to", "text_evidence": "освіти, Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2", "confidence": 0.976, "source": "proximity"}
|
| 69 |
+
{"relation_id": "19d147fecc3369ff", "source_entity_id": "1a39e93b2f96d97a", "target_entity_id": "dfa8525adddea21c", "relation_type": "related_to", "text_evidence": "Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2023 Передумо", "confidence": 0.955, "source": "proximity"}
|
| 70 |
+
{"relation_id": "c2a22a5d7c605ed2", "source_entity_id": "1a39e93b2f96d97a", "target_entity_id": "0e97e28b90683458", "relation_type": "related_to", "text_evidence": "Україна, № 1603 від 21.05.2021, № 5372 від 06.07.2023 Передумови Для здобуття освітнього ступеня бакалавра можуть в", "confidence": 0.91, "source": "proximity"}
|
| 71 |
+
{"relation_id": "6ea58ca9091d2a53", "source_entity_id": "b88bcdbd0c134780", "target_entity_id": "43f1fa06d0fb0bd7", "relation_type": "related_to", "text_evidence": "фесійної діяльності бакалаврів є: технічне, програмне, математ", "confidence": 0.958, "source": "proximity"}
|
| 72 |
+
{"relation_id": "948778fc99b44bc9", "source_entity_id": "b88bcdbd0c134780", "target_entity_id": "3159a3e3e182afdb", "relation_type": "related_to", "text_evidence": "фесійної діяльності бакалаврів є: технічне, програмне, математичне, інформаційне та ор", "confidence": 0.895, "source": "proximity"}
|
| 73 |
+
{"relation_id": "822ce990167332ba", "source_entity_id": "5e1dde48ef03f452", "target_entity_id": "0969f1b2971aa784", "relation_type": "related_to", "text_evidence": "дики та технології: здобувач має оволодіти методами та програмними засобами моделювання, проектування, авто", "confidence": 0.832, "source": "proximity"}
|
| 74 |
+
{"relation_id": "e0641c75aeb51efa", "source_entity_id": "5e1dde48ef03f452", "target_entity_id": "4c8642012c1b3a70", "relation_type": "related_to", "text_evidence": "дики та технології: здобувач має оволодіти методами та програмними засобами моделювання, проектування, автоматизованого у", "confidence": 0.793, "source": "proximity"}
|
| 75 |
+
{"relation_id": "c396cfc7365de42b", "source_entity_id": "5e1dde48ef03f452", "target_entity_id": "5d79062899f094ab", "relation_type": "related_to", "text_evidence": "дики та технології: здобувач має оволодіти методами та програмними засобами моделювання, проектування, автоматизованого управління складними організаційно-технічними об’єктами, інформаційними технолог", "confidence": 0.529, "source": "proximity"}
|
| 76 |
+
{"relation_id": "1bbe2a3c156da82e", "source_entity_id": "5e1dde48ef03f452", "target_entity_id": "d9262d20fe39c685", "relation_type": "related_to", "text_evidence": "дики та технології: здобувач має оволодіти методами та програмними засобами моделювання, проектування, автоматизованого управління складними організаційно-технічними об’єктами, інформаційними технолог", "confidence": 0.44199999999999995, "source": "proximity"}
|
| 77 |
+
{"relation_id": "b0409bc3469429cb", "source_entity_id": "2e8b38e56500a7a3", "target_entity_id": "49f9040d6c533349", "relation_type": "related_to", "text_evidence": "ичні задачі в сфері інженерії, будівництва та виробництва, автоматизації процесів управління та робототехніки. Основний фокус освітньої програми та спеціалізації Програма спрямована на підготовку висо", "confidence": 0.472, "source": "proximity"}
|
| 78 |
+
{"relation_id": "32a0fd4983d5f004", "source_entity_id": "c8c2ed4483d9e387", "target_entity_id": "49f9040d6c533349", "relation_type": "related_to", "text_evidence": "цесів управління та робототехніки. Основний фокус освітньої програми та спеціалізації Програма спрямована на підготовку висококваліфікованих кадрів в галузі електронік", "confidence": 0.7, "source": "proximity"}
|
| 79 |
+
{"relation_id": "bebe0d3465426933", "source_entity_id": "49f9040d6c533349", "target_entity_id": "6a877c5c22acd65d", "relation_type": "related_to", "text_evidence": "ована на підготовку висококваліфікованих кадрів в галузі електроніки, автоматизації, еле", "confidence": 0.889, "source": "proximity"}
|
| 80 |
+
{"relation_id": "8f0826a7790938c4", "source_entity_id": "699ec9c1516b9078", "target_entity_id": "9bd3d8195311b86a", "relation_type": "related_to", "text_evidence": "сифікатора України: ДК 003:2010): 21 Професіонали в галузі фізичних, математичних та технічних наук; 213 Професіонали в галузі обчислень (комп'ютеризації); 2131 Професіонали в галузі обчислювальних си", "confidence": 0.577, "source": "proximity"}
|
| 81 |
+
{"relation_id": "62113b1c7dcea37d", "source_entity_id": "86fd6e5fe8e9acc5", "target_entity_id": "05094b9aa362a52d", "relation_type": "related_to", "text_evidence": " навчання на основі підручників та конспектів, консультації з викладачами. Лекції (мультимедійної, інтерактивні) проводяться лекторами – професорами i доцентами, а тако", "confidence": 0.649, "source": "proximity"}
|
| 82 |
+
{"relation_id": "eb27ddb47dd9c289", "source_entity_id": "730864a279fa38ed", "target_entity_id": "ca36425b9fea1047", "relation_type": "related_to", "text_evidence": "вових документів та міжнародних стандартів. Вміти використовувати різноманітне спеціалізоване програмне забезпечення для розв’язування типових інженерних задач у галузі автоматизації, зокрема, математ", "confidence": 0.481, "source": "proximity"}
|
| 83 |
+
{"relation_id": "0b9ab1d76e347c91", "source_entity_id": "730864a279fa38ed", "target_entity_id": "eccdae69f659f373", "relation_type": "related_to", "text_evidence": "вових документів та міжнародних стандартів. Вміти використовувати різноманітне спеціалізоване програмне забезпечення для розв’язування типових інженерних задач у галузі автоматизації, зокрема, математ", "confidence": 0.4, "source": "proximity"}
|
| 84 |
+
{"relation_id": "b6071be7bc5b51d8", "source_entity_id": "43c51e4e3af97e3b", "target_entity_id": "9d185bb8bfa967bc", "relation_type": "related_to", "text_evidence": "явності ID картки); військовий квиток або приписне посвідчення (для юнаків); довідка ДПА про присвоєння реєс", "confidence": 0.8049999999999999, "source": "proximity"}
|
| 85 |
+
{"relation_id": "b6bc42683b85ef62", "source_entity_id": "5c3c6ed4d533a2c1", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "о інтелекту 2025 / 2026 навчальний рік 2 Прог��аму рекомендовано до затвердження Вченою радою ННІ комп’ютерних наук та штучного інтелекту «23» вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ", "confidence": 0.577, "source": "proximity"}
|
| 86 |
+
{"relation_id": "05f368d71978e467", "source_entity_id": "581d47447ce9c3f4", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "лекту «23» вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ", "confidence": 0.97, "source": "proximity"}
|
| 87 |
+
{"relation_id": "b4967a922fbf07fd", "source_entity_id": "581d47447ce9c3f4", "target_entity_id": "b0c9f122b4dde681", "relation_type": "related_to", "text_evidence": "лекту «23» вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: кандидат технічних наук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил", "confidence": 0.748, "source": "proximity"}
|
| 88 |
+
{"relation_id": "59b1daeed58a9049", "source_entity_id": "581d47447ce9c3f4", "target_entity_id": "c9fbd3e8cd885042", "relation_type": "related_to", "text_evidence": "лекту «23» вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: кандидат технічних наук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил Петрович; Програму схвале", "confidence": 0.595, "source": "proximity"}
|
| 89 |
+
{"relation_id": "4020539cfad5bc47", "source_entity_id": "cef1dffeff86b2ad", "target_entity_id": "b0c9f122b4dde681", "relation_type": "related_to", "text_evidence": " вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: кандидат технічних наук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил", "confidence": 0.778, "source": "proximity"}
|
| 90 |
+
{"relation_id": "87602fa66a983d05", "source_entity_id": "cef1dffeff86b2ad", "target_entity_id": "c9fbd3e8cd885042", "relation_type": "related_to", "text_evidence": " вересня 2025 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: кандидат технічних наук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил Петрович; Програму схвалено ", "confidence": 0.625, "source": "proximity"}
|
| 91 |
+
{"relation_id": "1ea10d42276726f5", "source_entity_id": "b0c9f122b4dde681", "target_entity_id": "c9fbd3e8cd885042", "relation_type": "related_to", "text_evidence": "аук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил Петрович; Програму схвалено ", "confidence": 0.847, "source": "proximity"}
|
| 92 |
+
{"relation_id": "e84de7c1dd3166ac", "source_entity_id": "b0c9f122b4dde681", "target_entity_id": "48c60bc68fad6568", "relation_type": "related_to", "text_evidence": "аук, доцент, доцент кафедри математичного моделювання та аналізу даних Коробчинський Кирил Петрович; Програму схвалено на засіданні кафедри математичного моделювання та аналізу даних Протокол від «28»", "confidence": 0.403, "source": "proximity"}
|
| 93 |
+
{"relation_id": "0375ada8d4d7d7b5", "source_entity_id": "c9fbd3e8cd885042", "target_entity_id": "48c60bc68fad6568", "relation_type": "related_to", "text_evidence": "ня та аналізу даних Коробчинський Кирил Петрович; Програму схвалено на засіданні кафедри математичного моделювання та аналізу даних Протокол від «28» серпня 2025 року, протокол № 1 Завідувач кафедри м", "confidence": 0.556, "source": "proximity"}
|
| 94 |
+
{"relation_id": "ac09f38ff0a58dbb", "source_entity_id": "48c60bc68fad6568", "target_entity_id": "d9e6447f15698416", "relation_type": "related_to", "text_evidence": "» серпня 2025 року, протокол № 1 Завідувач кафедри математичного моделювання та аналізу даних Володимир СТРУКОВ Програму погоджено ", "confidence": 0.778, "source": "proximity"}
|
| 95 |
+
{"relation_id": "efd565e1df22caf7", "source_entity_id": "58293efde3d65766", "target_entity_id": "9911b886b4e40ea5", "relation_type": "related_to", "text_evidence": "«Комп'ютерні науки» Віталіна БАБЕНКО Прог��аму погоджено науково-методичною комісією навчально-наукового інституту комп’ютерних наук та штучного інтелекту Протокол від 08 вересня 2025 року, протокол № ", "confidence": 0.5589999999999999, "source": "proximity"}
|
| 96 |
+
{"relation_id": "f43496ba82f03ff6", "source_entity_id": "9911b886b4e40ea5", "target_entity_id": "212a4cbee48972fb", "relation_type": "related_to", "text_evidence": "електу Протокол від 08 вересня 2025 року, протокол № 1 Голова науково-методичної комісії навчально-наукового інституту комп’ютерних наук та штучного інтелекту Євгеній ПОКЛОНСЬКИЙ 3 ВСТУП Програма на", "confidence": 0.583, "source": "proximity"}
|
| 97 |
+
{"relation_id": "d1e1b636e571d75d", "source_entity_id": "97cd6dc5749e6b61", "target_entity_id": "c0c976dc9f08ee09", "relation_type": "related_to", "text_evidence": "амування, володіння алгоритмічним мисленням, методами програмної інженерії для реалізації програмного забезпечення з урахуванням вимог до його якості, надійності, виробничих характеристик. 4 ФК03. Зда", "confidence": 0.487, "source": "proximity"}
|
| 98 |
+
{"relation_id": "8a39e2236d4c37a0", "source_entity_id": "c0c976dc9f08ee09", "target_entity_id": "97722021b1a2792b", "relation_type": "related_to", "text_evidence": "их характеристик. 4 ФК03. Здатність до логічного мислення, побудови логічних ", "confidence": 0.9430000000000001, "source": "proximity"}
|
| 99 |
+
{"relation_id": "e4d5d5515d4c1c7d", "source_entity_id": "c41170818cb6c11a", "target_entity_id": "7d7502050cc53cdd", "relation_type": "related_to", "text_evidence": "глядаються принципи ACID (атомарність, узгодженість, ізольованість, довговічність) та їх реалізація у різних СУБД.", "confidence": 0.733, "source": "proximity"}
|
| 100 |
+
{"relation_id": "7c7a3e2d4b622fb8", "source_entity_id": "9046fecf16ca5776", "target_entity_id": "7d7502050cc53cdd", "relation_type": "related_to", "text_evidence": "ться принципи ACID (атомарність, узгодженість, ізольованість, довговічність) та їх реалізація у різних СУБД.", "confidence": 0.751, "source": "proximity"}
|
| 101 |
+
{"relation_id": "2ef649d9a2121b1a", "source_entity_id": "60d10086ca90e035", "target_entity_id": "7d7502050cc53cdd", "relation_type": "related_to", "text_evidence": " ACID (атомарність, узгодженість, ізольованість, довговічність) та їх реалізація у різних СУБД.", "confidence": 0.79, "source": "proximity"}
|
| 102 |
+
{"relation_id": "412fe0bf773c7fa6", "source_entity_id": "d82a8d307fd8eb65", "target_entity_id": "7d7502050cc53cdd", "relation_type": "related_to", "text_evidence": "ість, узгодженість, ізольованість, довговічність) та їх реалізація у різних СУБД.", "confidence": 0.832, "source": "proximity"}
|
| 103 |
+
{"relation_id": "2621f9ac9c6f41f6", "source_entity_id": "cf56ebddf5a35607", "target_entity_id": "7d7502050cc53cdd", "relation_type": "related_to", "text_evidence": "сть, ізольованість, довговічність) та їх реалізація у різних СУБД.", "confidence": 0.877, "source": "proximity"}
|
| 104 |
+
{"relation_id": "d1bf6de014b1a098", "source_entity_id": "70fbedafe4c03090", "target_entity_id": "6e95386ba46783ae", "relation_type": "related_to", "text_evidence": "гери в базах даних. 12 3 2 2 5 Тема 11. Транзакції та захист даних. 12 3 2 2 5 Разом за розділом 4 24 6 4 4 10 Усього годин за II семестр 120 32 16 16 0 56 У", "confidence": 0.679, "source": "proximity"}
|
| 105 |
+
{"relation_id": "20021e154ce39909", "source_entity_id": "6e95386ba46783ae", "target_entity_id": "7da8314b0a04d391", "relation_type": "related_to", "text_evidence": " 10 Усього годин за II семестр 120 32 16 16 0 56 Усьог", "confidence": 0.967, "source": "proximity"}
|
| 106 |
+
{"relation_id": "04411e091d8807e7", "source_entity_id": "6e95386ba46783ae", "target_entity_id": "dd9e68038557a4d1", "relation_type": "related_to", "text_evidence": " 10 Усього годин за II семестр 120 32 16 16 0 56 Усього годин 120 32", "confidence": 0.967, "source": "proximity"}
|
| 107 |
+
{"relation_id": "c0ae320e2a6e9b68", "source_entity_id": "00cd6ec8a2ea3f44", "target_entity_id": "6e95386ba46783ae", "relation_type": "related_to", "text_evidence": "7 Вивчення DCL та робота з тригерами. 2 Усього годин за II семестр 16", "confidence": 0.838, "source": "proximity"}
|
| 108 |
+
{"relation_id": "feab69d781139a38", "source_entity_id": "b18f2b51c06200ef", "target_entity_id": "79f52309b208feba", "relation_type": "related_to", "text_evidence": "а у відповідності з технічним завданням за вказаний час – студент отримує 100% від ма", "confidence": 0.886, "source": "proximity"}
|
| 109 |
+
{"relation_id": "a7b052c5c4094b57", "source_entity_id": "79f52309b208feba", "target_entity_id": "b18f2b51c06200ef", "relation_type": "related_to", "text_evidence": "м за вказаний час – студент отримує 100% від максимальної кількості балів; • робота була виконана у відповідності з технічним завданням с запізненням – студент отримує 60 % від максимальної кількості ", "confidence": 0.43300000000000005, "source": "proximity"}
|
| 110 |
+
{"relation_id": "26fe6ace5996a9ce", "source_entity_id": "355b77129964e5ba", "target_entity_id": "0e25ef31b4529625", "relation_type": "related_to", "text_evidence": "ості у відповіді; • 0 балів – у випадку значної неточності, помилки, або відсутності відповіді. Практичні завдання оцінюються таким чи", "confidence": 0.772, "source": "proximity"}
|
| 111 |
+
{"relation_id": "b33c74c18a0ba24d", "source_entity_id": "7a34d4c3a4e2a5d9", "target_entity_id": "5af4e9f79989b6cc", "relation_type": "related_to", "text_evidence": " : навч. посібник / А. Пасічник. – Львів : Магнолія- 2006, 2012. – 584 ", "confidence": 0.931, "source": "proximity"}
|
| 112 |
+
{"relation_id": "aa10a54e8f60ccbe", "source_entity_id": "7a34d4c3a4e2a5d9", "target_entity_id": "c2cb6858473acfca", "relation_type": "related_to", "text_evidence": " : навч. посібник / А. Пасічник. – Львів : Магнолія- 2006, 2012. – 584 с.", "confidence": 0.883, "source": "proximity"}
|
| 113 |
+
{"relation_id": "2558105da4ea9930", "source_entity_id": "5af4e9f79989b6cc", "target_entity_id": "c2cb6858473acfca", "relation_type": "related_to", "text_evidence": "Пасічник. – Львів : Магнолія- 2006, 2012. – 584 с.", "confidence": 0.952, "source": "proximity"}
|
| 114 |
+
{"relation_id": "f7efd863ec947966", "source_entity_id": "6f9322cda2097382", "target_entity_id": "fb8e867f51196d1e", "relation_type": "related_to", "text_evidence": "них: навч. посібник/О.Г.Трофименко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.76, "source": "proximity"}
|
| 115 |
+
{"relation_id": "1f4a0b6fc3a89bb4", "source_entity_id": "6f9322cda2097382", "target_entity_id": "ce706d1201dd22bb", "relation_type": "related_to", "text_evidence": "них: навч. посібник/О.Г.Трофименко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.736, "source": "proximity"}
|
| 116 |
+
{"relation_id": "5540ed313f4901b1", "source_entity_id": "50c3bc64c15d6a7b", "target_entity_id": "fb8e867f51196d1e", "relation_type": "related_to", "text_evidence": "ник/О.Г.Трофименко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.808, "source": "proximity"}
|
| 117 |
+
{"relation_id": "78c676fc1406b02d", "source_entity_id": "50c3bc64c15d6a7b", "target_entity_id": "ce706d1201dd22bb", "relation_type": "related_to", "text_evidence": "ник/О.Г.Трофименко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.784, "source": "proximity"}
|
| 118 |
+
{"relation_id": "caa465b0f27b01c5", "source_entity_id": "07624d428730f41a", "target_entity_id": "fb8e867f51196d1e", "relation_type": "related_to", "text_evidence": "именко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.844, "source": "proximity"}
|
| 119 |
+
{"relation_id": "1900118d1c3e880f", "source_entity_id": "07624d428730f41a", "target_entity_id": "ce706d1201dd22bb", "relation_type": "related_to", "text_evidence": "именко, Ю.В.Прокоп, Н.І. Копитчук. 2-ге вид. виправ. і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.8200000000000001, "source": "proximity"}
|
| 120 |
+
{"relation_id": "76b3989aa542aa43", "source_entity_id": "fb8e867f51196d1e", "target_entity_id": "ce706d1201dd22bb", "relation_type": "related_to", "text_evidence": " і доповн. – Одеса: Фенікс, 2019.–246с.", "confidence": 0.976, "source": "proximity"}
|
| 121 |
+
{"relation_id": "10b40cb044f75344", "source_entity_id": "991851bc4eb66ce7", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "о інтелекту 2024 / 2025 навчальний рік Програму обговорено та рекомендовано до затвердження Вченою радою Навчально-наукового інституту комп’ютерних наук та штучного інтелекту “ 23” _вересня_ 2024 року", "confidence": 0.45399999999999996, "source": "proximity"}
|
| 122 |
+
{"relation_id": "05309890bdf24f38", "source_entity_id": "0f8293abf49fb235", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "ту “ 23” _вересня_ 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ", "confidence": 0.97, "source": "proximity"}
|
| 123 |
+
{"relation_id": "6170a88660bfd8ed", "source_entity_id": "0f8293abf49fb235", "target_entity_id": "16a58157ceda2854", "relation_type": "related_to", "text_evidence": "ту “ 23” _вересня_ 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: ДОНЕЦЬ Володимир Віталійович, Ph.D.", "confidence": 0.868, "source": "proximity"}
|
| 124 |
+
{"relation_id": "e0ec7e443024f4ec", "source_entity_id": "cef1dffeff86b2ad", "target_entity_id": "16a58157ceda2854", "relation_type": "related_to", "text_evidence": "вересня_ 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: ДОНЕЦЬ Володимир Віталійович, Ph.D.", "confidence": 0.898, "source": "proximity"}
|
| 125 |
+
{"relation_id": "a05c1a4c71c3ea5e", "source_entity_id": "b0c9f122b4dde681", "target_entity_id": "01cb9d90b7f7b0fb", "relation_type": "related_to", "text_evidence": "r Science, викладач кафедри математичного моделювання та аналізу даних; МІШИН Олександр Вікторович, ст. викладач кафед", "confidence": 0.844, "source": "proximity"}
|
| 126 |
+
{"relation_id": "492b2d01ac13228d", "source_entity_id": "01cb9d90b7f7b0fb", "target_entity_id": "0c32e3788d1919fe", "relation_type": "related_to", "text_evidence": "я та аналізу даних; МІШИН Олександр Вікторович, ст. викладач кафедри математичного моделювання та аналізу даних; Програму схвалено на засіданні кафедри математичного моделювання та аналізу даних Прото", "confidence": 0.43300000000000005, "source": "proximity"}
|
| 127 |
+
{"relation_id": "ccd10b9894544735", "source_entity_id": "0c32e3788d1919fe", "target_entity_id": "7f6f05d5d2de017e", "relation_type": "related_to", "text_evidence": "даних Протокол від «13» вересня 2024 року, протокол № 3 В.о. завідувача каф", "confidence": 0.931, "source": "proximity"}
|
| 128 |
+
{"relation_id": "3aafb72b5a4ced1a", "source_entity_id": "0c32e3788d1919fe", "target_entity_id": "d9e6447f15698416", "relation_type": "related_to", "text_evidence": "даних Протокол від «13» вересня 2024 року, протокол № 3 В.о. завідувача кафедри математичного моделювання та аналізу даних Володимир СТРУКОВ Програму погоджено ", "confidence": 0.6910000000000001, "source": "proximity"}
|
| 129 |
+
{"relation_id": "e472e40d1a216c0f", "source_entity_id": "0c32e3788d1919fe", "target_entity_id": "319103bce9ac4ecb", "relation_type": "related_to", "text_evidence": "даних Протокол від «13» вересня 2024 року, протокол № 3 В.о. завідувача кафедри математичного моделювання та аналізу даних Володимир СТРУКОВ Програму погоджено з гарантом освітньої програми «Комп'ютер", "confidence": 0.487, "source": "proximity"}
|
| 130 |
+
{"relation_id": "eaa1097a8e5763c4", "source_entity_id": "7f6f05d5d2de017e", "target_entity_id": "d9e6447f15698416", "relation_type": "related_to", "text_evidence": " вересня 2024 року, протокол № 3 В.о. завідувача кафедри математичного моделювання та аналізу даних Володимир СТРУКОВ Програму погоджено ", "confidence": 0.76, "source": "proximity"}
|
| 131 |
+
{"relation_id": "42dbbe1e74ad11ad", "source_entity_id": "7f6f05d5d2de017e", "target_entity_id": "319103bce9ac4ecb", "relation_type": "related_to", "text_evidence": " вересня 2024 року, протокол № 3 В.о. завідувача кафедри математичного моделювання та аналізу даних Володимир СТРУКОВ Програму погоджено з гарантом освітньої програми «Комп'ютерні науки» Гарант освітн", "confidence": 0.556, "source": "proximity"}
|
| 132 |
+
{"relation_id": "be68bfd5172550ae", "source_entity_id": "d9e6447f15698416", "target_entity_id": "319103bce9ac4ecb", "relation_type": "related_to", "text_evidence": "ня та аналізу даних Володимир СТРУКОВ Програму погоджено з гарантом освітньої програми «Комп'ютерні науки» Гарант освітньої п", "confidence": 0.796, "source": "proximity"}
|
| 133 |
+
{"relation_id": "99b7ddb0d2ae0ec6", "source_entity_id": "319103bce9ac4ecb", "target_entity_id": "ca41d203d504603f", "relation_type": "related_to", "text_evidence": "освітньої програми «Комп'ютерні науки» Гарант освітньої програми «Комп'ютерні науки» Оксана ПОДОЛЯКА Програму погоджено ", "confidence": 0.8049999999999999, "source": "proximity"}
|
| 134 |
+
{"relation_id": "ec99605cdf843d87", "source_entity_id": "ca41d203d504603f", "target_entity_id": "0c32e3788d1919fe", "relation_type": "related_to", "text_evidence": "«Комп'ютерні науки» Оксана ПОДОЛЯКА Програму погоджено науково-методичною комісією навчально-наукового інституту комп’ютерних наук та штучного інтелекту Протокол від 23 вересня 2024 року, протокол № 2", "confidence": 0.562, "source": "proximity"}
|
| 135 |
+
{"relation_id": "4ae4ed137a4d9369", "source_entity_id": "ca41d203d504603f", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "«Комп'ютерні науки» Оксана ПОДОЛЯКА Програму погоджено науково-методичною комісією навчально-наукового інституту комп’ютерних наук та штучного інтелекту Протокол від 23 вересня 2024 року, протокол № 2", "confidence": 0.496, "source": "proximity"}
|
| 136 |
+
{"relation_id": "f7120c76510ce7d3", "source_entity_id": "0c32e3788d1919fe", "target_entity_id": "cef1dffeff86b2ad", "relation_type": "related_to", "text_evidence": "електу Протокол від 23 вересня 2024 року, протокол № 2 Голова науково-мето", "confidence": 0.9339999999999999, "source": "proximity"}
|
| 137 |
+
{"relation_id": "13018129d475aae0", "source_entity_id": "0c32e3788d1919fe", "target_entity_id": "212a4cbee48972fb", "relation_type": "related_to", "text_evidence": "електу Протокол від 23 вересня 2024 року, протокол № 2 Голова науково-методичної комісії навчально-наукового інституту комп’ютерних наук та штучного інтелекту Євгеній ПОКЛОНСЬКИЙ ВСТУП Програма навч", "confidence": 0.583, "source": "proximity"}
|
| 138 |
+
{"relation_id": "db23fc3e2b36db6c", "source_entity_id": "cef1dffeff86b2ad", "target_entity_id": "212a4cbee48972fb", "relation_type": "related_to", "text_evidence": " вересня 2024 року, протокол № 2 Голова науково-методичної комісії навчально-наукового інституту комп’ютерних наук та штучного інтелекту Євгеній ПОКЛОНСЬКИЙ ВСТУП Програма навч", "confidence": 0.649, "source": "proximity"}
|
| 139 |
+
{"relation_id": "98c8643587227d0a", "source_entity_id": "8ea18a7337f8bebc", "target_entity_id": "79f52309b208feba", "relation_type": "related_to", "text_evidence": "ідвідування лекцій: 5 балів: студент відвідав 90 - 100 %", "confidence": 0.973, "source": "proximity"}
|
| 140 |
+
{"relation_id": "36bcfc3e5afddc51", "source_entity_id": "79f52309b208feba", "target_entity_id": "ed1e551c2e2c23f4", "relation_type": "related_to", "text_evidence": "ня лекцій: 5 балів: студент відвідав 90 - 100 % лекційних занять; 4 бали: студент відвідав 6", "confidence": 0.862, "source": "proximity"}
|
| 141 |
+
{"relation_id": "405c73134a7756f7", "source_entity_id": "79f52309b208feba", "target_entity_id": "e875322dc2569a22", "relation_type": "related_to", "text_evidence": "ня лекцій: 5 балів: студент відвідав 90 - 100 % лекційних занять; 4 бали: студент відвідав 66 - 89 % лекційних занять; 3 бали: студент відвідав 41 - 65 % лекційних занять; 2 балів: студент відвідав 2", "confidence": 0.544, "source": "proximity"}
|
| 142 |
+
{"relation_id": "469c6fd14239903c", "source_entity_id": "79f52309b208feba", "target_entity_id": "784c19bb97840e78", "relation_type": "related_to", "text_evidence": "ії оцінювання знань студентів під час підсумкового контролю 16-20 балів: - студент демонструє глибоке розуміння теми питання; - студент повністю розкриває сутність питання; - в роботі наведені приклад", "confidence": 0.529, "source": "proximity"}
|
| 143 |
+
{"relation_id": "27e406a7969a0a92", "source_entity_id": "782aee541767578b", "target_entity_id": "ce706d1201dd22bb", "relation_type": "related_to", "text_evidence": "g Text with Python. Manning Publications, 2019.", "confidence": 0.9339999999999999, "source": "proximity"}
|
| 144 |
+
{"relation_id": "ec84ea7ba9f90abe", "source_entity_id": "8e0e8ac292079e8f", "target_entity_id": "ec472be46def442f", "relation_type": "related_to", "text_evidence": "есурс]. Copyright © 1993, 1994, Nikos Drakos, Computer Based Lea", "confidence": 0.964, "source": "proximity"}
|
| 145 |
+
{"relation_id": "cade74c4cb6a3548", "source_entity_id": "8e0e8ac292079e8f", "target_entity_id": "a156e5e34505bff7", "relation_type": "related_to", "text_evidence": "есурс]. Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leed", "confidence": 0.922, "source": "proximity"}
|
| 146 |
+
{"relation_id": "5503d414043847a3", "source_entity_id": "8e0e8ac292079e8f", "target_entity_id": "8ae61549fb9dbd7f", "relation_type": "related_to", "text_evidence": "есурс]. Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.", "confidence": 0.832, "source": "proximity"}
|
| 147 |
+
{"relation_id": "90ebf5e804f87049", "source_entity_id": "8e0e8ac292079e8f", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": "есурс]. Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.semanticscholar.or", "confidence": 0.73, "source": "proximity"}
|
| 148 |
+
{"relation_id": "778d96f611b6eed5", "source_entity_id": "3abc3cf6dfcc60ab", "target_entity_id": "ec472be46def442f", "relation_type": "related_to", "text_evidence": ". Copyright © 1993, 1994, Nikos Drakos, Computer Based Lea", "confidence": 0.982, "source": "proximity"}
|
| 149 |
+
{"relation_id": "1df9fd73b4fb0571", "source_entity_id": "3abc3cf6dfcc60ab", "target_entity_id": "a156e5e34505bff7", "relation_type": "related_to", "text_evidence": ". Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leed", "confidence": 0.94, "source": "proximity"}
|
| 150 |
+
{"relation_id": "ae05b65d976d4a8f", "source_entity_id": "3abc3cf6dfcc60ab", "target_entity_id": "8ae61549fb9dbd7f", "relation_type": "related_to", "text_evidence": ". Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.", "confidence": 0.85, "source": "proximity"}
|
| 151 |
+
{"relation_id": "5a30bb1040678191", "source_entity_id": "3abc3cf6dfcc60ab", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": ". Copyright © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.semanticscholar.or", "confidence": 0.748, "source": "proximity"}
|
| 152 |
+
{"relation_id": "e483a9dbdf307b9f", "source_entity_id": "ec472be46def442f", "target_entity_id": "a156e5e34505bff7", "relation_type": "related_to", "text_evidence": "right © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leed", "confidence": 0.958, "source": "proximity"}
|
| 153 |
+
{"relation_id": "24574ada1d7737ac", "source_entity_id": "ec472be46def442f", "target_entity_id": "8ae61549fb9dbd7f", "relation_type": "related_to", "text_evidence": "right © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.", "confidence": 0.868, "source": "proximity"}
|
| 154 |
+
{"relation_id": "41d7b7303e37ad12", "source_entity_id": "ec472be46def442f", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": "right © 1993, 1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.semanticscholar.or", "confidence": 0.766, "source": "proximity"}
|
| 155 |
+
{"relation_id": "977fe88984c04ecb", "source_entity_id": "a156e5e34505bff7", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": "1994, Nikos Drakos, Computer Based Learning Unit, University of Leeds. URL: https://pdfs.semanticscholar.or", "confidence": 0.808, "source": "proximity"}
|
| 156 |
+
{"relation_id": "0e300e8468619482", "source_entity_id": "8ae61549fb9dbd7f", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": "ased Learning Unit, University of Leeds. URL: https://pdfs.semanticscholar.or", "confidence": 0.898, "source": "proximity"}
|
| 157 |
+
{"relation_id": "3040fb6a65110409", "source_entity_id": "89a8294091d81ce8", "target_entity_id": "a0c1c19f89128b5b", "relation_type": "related_to", "text_evidence": "https://www.coursera.org/specializations/machine-learning-introduction Посиланная на інфор", "confidence": 0.913, "source": "proximity"}
|
| 158 |
+
{"relation_id": "9444e10cb00f50dd", "source_entity_id": "319103bce9ac4ecb", "target_entity_id": "7b046ef3d57edb7a", "relation_type": "related_to", "text_evidence": "ї спеціальність 122 Комп'ютерні науки освітня програма Комп'ютерні наук�� вид дисципліни обов’язкова Навчально-науковий інститут комп’ютерних наук та штучного інтелекту 2025 / 2026 навчальний р", "confidence": 0.556, "source": "proximity"}
|
| 159 |
+
{"relation_id": "8e2b6ab482b243e0", "source_entity_id": "7b046ef3d57edb7a", "target_entity_id": "991851bc4eb66ce7", "relation_type": "related_to", "text_evidence": " штучного інтелекту 2025 / 2026 навчальний рік Програму обговорено та рекомендовано до затвер", "confidence": 0.976, "source": "proximity"}
|
| 160 |
+
{"relation_id": "e0a4f30cecfe100e", "source_entity_id": "7b046ef3d57edb7a", "target_entity_id": "48c60bc68fad6568", "relation_type": "related_to", "text_evidence": " штучного інтелекту 2025 / 2026 навчальний рік Програму обговорено та рекомендовано до затвердження Вченою радою Навчально-наукового інституту комп’ютерних наук та штучного інтелекту “ 25” _вересня_ 2", "confidence": 0.43000000000000005, "source": "proximity"}
|
| 161 |
+
{"relation_id": "8ec64e53a7d5787e", "source_entity_id": "991851bc4eb66ce7", "target_entity_id": "48c60bc68fad6568", "relation_type": "related_to", "text_evidence": "о інтелекту 2025 / 2026 навчальний рік Програму обговорено та рекомендовано до затвердження Вченою радою Навчально-наукового інституту комп’ютерних наук та штучного інтелекту “ 25” _вересня_ 2025 року", "confidence": 0.45399999999999996, "source": "proximity"}
|
| 162 |
+
{"relation_id": "0e0fc51267c13e0d", "source_entity_id": "48c60bc68fad6568", "target_entity_id": "16a58157ceda2854", "relation_type": "related_to", "text_evidence": "вересня_ 2025 року, протокол № 1 РОЗРОБНИКИ ПРОГРАМИ: ДОНЕЦЬ Володимир Віталійович, Ph.D.", "confidence": 0.898, "source": "proximity"}
|
| 163 |
+
{"relation_id": "379e9f1b886c7983", "source_entity_id": "d06b53ab82edfc5c", "target_entity_id": "319103bce9ac4ecb", "relation_type": "related_to", "text_evidence": "рівень вищої освіти перший (бакалаврський) рівень галузь знань 12 Інформаційні технології спеціальність 122 Комп'ютерні науки освітня програма Ко", "confidence": 0.736, "source": "proximity"}
|
| 164 |
+
{"relation_id": "ac7207c7798f1312", "source_entity_id": "0f8293abf49fb235", "target_entity_id": "e3c29c94fe5f788a", "relation_type": "related_to", "text_evidence": "лекту «24» вересня 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: КОВАЛЬЧУК Дмитро Миколайович, доктор філософії з", "confidence": 0.868, "source": "proximity"}
|
| 165 |
+
{"relation_id": "c91b8f82cff7bfe8", "source_entity_id": "0f8293abf49fb235", "target_entity_id": "b0c9f122b4dde681", "relation_type": "related_to", "text_evidence": "лекту «24» вересня 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: КОВАЛЬЧУК Дмитро Миколайович, доктор філософії з спеціальності комп’ютерні науки, старший викладач кафедри математичного моделювання та ", "confidence": 0.571, "source": "proximity"}
|
| 166 |
+
{"relation_id": "5c937a3107b75fb8", "source_entity_id": "cef1dffeff86b2ad", "target_entity_id": "e3c29c94fe5f788a", "relation_type": "related_to", "text_evidence": " вересня 2024 року, протокол № 2 РОЗРОБНИКИ ПРОГРАМИ: КОВАЛЬЧУК Дмитро Миколайович, доктор філософії з", "confidence": 0.898, "source": "proximity"}
|
| 167 |
+
{"relation_id": "3b322f195bc9e909", "source_entity_id": "e3c29c94fe5f788a", "target_entity_id": "b0c9f122b4dde681", "relation_type": "related_to", "text_evidence": "ОЗРОБНИКИ ПРОГРАМИ: КОВАЛЬЧУК Дмитро Миколайович, доктор філософії з спеціальності комп’ютерні науки, старший викладач кафедри математичного моделювання та аналізу даних; Програму схвалено ", "confidence": 0.7030000000000001, "source": "proximity"}
|
| 168 |
+
{"relation_id": "1f52ecbac595ebe3", "source_entity_id": "58293efde3d65766", "target_entity_id": "f58e8e6f3ea6435a", "relation_type": "related_to", "text_evidence": "«Комп'ютерні науки» Віталіна БАБЕНКО Програму погоджено науково-методичною комісією Навчально-наукового інституту комп’ютерних наук та штучного інтелекту Протокол від 8 вересня 2024 року, № 1 Голова н", "confidence": 0.5589999999999999, "source": "proximity"}
|
| 169 |
+
{"relation_id": "2ba2b9fdab2f9faa", "source_entity_id": "f58e8e6f3ea6435a", "target_entity_id": "212a4cbee48972fb", "relation_type": "related_to", "text_evidence": "електу Протокол від 8 вересня 2024 року, № 1 Голова науково-методичної комісії Навчально-наукового інституту комп’ютерних наук та штучного інтелекту Євген ПОКЛОНСЬКИЙ 3 ВСТУП Програма на", "confidence": 0.613, "source": "proximity"}
|
| 170 |
+
{"relation_id": "86818e1308f94a22", "source_entity_id": "3661c2e944d40b82", "target_entity_id": "7195222d09304114", "relation_type": "related_to", "text_evidence": "Використання Tableau для аналізу великих даних 2 Разом 32 10", "confidence": 0.874, "source": "proximity"}
|
| 171 |
+
{"relation_id": "6ba90a2078abadb7", "source_entity_id": "8b2212d720e7654c", "target_entity_id": "98967461c0d03924", "relation_type": "related_to", "text_evidence": "ер. з англ. – Київ: Діалектика, 2021. – 520 с.", "confidence": 0.964, "source": "proximity"}
|
| 172 |
+
{"relation_id": "e50ac6e7f3a06a53", "source_entity_id": "3482d2ca636c03f4", "target_entity_id": "dfe899f01540e66e", "relation_type": "related_to", "text_evidence": "Миколаїв : Вид-во ЧНУ ім. Петра Могили, 2023. - 320 с.", "confidence": 0.913, "source": "proximity"}
|
| 173 |
+
{"relation_id": "6d4034839277657e", "source_entity_id": "ee26938da00b173f", "target_entity_id": "a8ce5d63880f52fa", "relation_type": "related_to", "text_evidence": "Чернівецький національний університет, 2024. 454 с. Інформаційн", "confidence": 0.883, "source": "proximity"}
|
| 174 |
+
{"relation_id": "bc33fddbe2ea89a5", "source_entity_id": "4f6771b02873d1a8", "target_entity_id": "ee872eca29098193", "relation_type": "related_to", "text_evidence": "https://www.microsoft.com/uk-ua/download/details.aspx?id=58494 14 6. Power BI documentation. [Електронний ресур", "confidence": 0.829, "source": "proximity"}
|
| 175 |
+
{"relation_id": "d562da94b00ab93b", "source_entity_id": "4f6771b02873d1a8", "target_entity_id": "e9177cbaa806b0f2", "relation_type": "related_to", "text_evidence": "https://learn.microsoft.com/pdf?url=https%3A%2F%2Fl", "confidence": 0.958, "source": "proximity"}
|
| 176 |
+
{"relation_id": "5cf6ebbcbbfce4a2", "source_entity_id": "7aedef9d8d8558b6", "target_entity_id": "b96397144e6ee4c1", "relation_type": "related_to", "text_evidence": "тва» В.о. декана ІІ Медичного факультету В.о. директора територіально відокремленого структурного підрозділу «Бахмутський навчально-науковий професійно-педагогічний інститут Харківського національного", "confidence": 0.41800000000000004, "source": "proximity"}
|
| 177 |
+
{"relation_id": "5e786268078e8a6b", "source_entity_id": "09d7289e764aa019", "target_entity_id": "b96397144e6ee4c1", "relation_type": "related_to", "text_evidence": "дагогічний інститут Харківського національного університету Денис КОВАЛЕНКО Ольга ПЄШКОВА Олекс", "confidence": 0.88, "source": "proximity"}
|
| 178 |
+
{"relation_id": "0f773fd2f6320c24", "source_entity_id": "09d7289e764aa019", "target_entity_id": "3e8c1df8b14f7b40", "relation_type": "related_to", "text_evidence": "дагогічний інститут Харківського національного університету Денис КОВАЛЕНКО Ольга ПЄШКОВА Олександр КОЗЛОВ 7.", "confidence": 0.832, "source": "proximity"}
|
| 179 |
+
{"relation_id": "dfce6195262dd668", "source_entity_id": "09d7289e764aa019", "target_entity_id": "89b95069493fa022", "relation_type": "related_to", "text_evidence": "дагогічний інститут Харківського національного університету Денис КОВАЛЕНКО Ольга ПЄШКОВА Олександр КОЗЛОВ 7.28. імені В.Н. Го", "confidence": 0.79, "source": "proximity"}
|
| 180 |
+
{"relation_id": "3bef530b02cfdfee", "source_entity_id": "09d7289e764aa019", "target_entity_id": "604e588ff8b579b8", "relation_type": "related_to", "text_evidence": "дагогічний інститут Харківського національного університету Денис КОВАЛЕНКО Ольга ПЄШКОВА Олександр КОЗЛОВ 7.28. імені В.Н. Голова первинної профспілкової організації студентів, аспірантів і докторант", "confidence": 0.45099999999999996, "source": "proximity"}
|
data/prebuilt/corpus/stats.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"processed_at": "2026-02-09T17:01:31.101524",
|
| 3 |
+
"document_count": 18,
|
| 4 |
+
"total_chunks": 374,
|
| 5 |
+
"total_entities": 264,
|
| 6 |
+
"total_relations": 180
|
| 7 |
+
}
|
data/prebuilt/graph/communities.json
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"community_id": "community_0_0",
|
| 4 |
+
"level": 0,
|
| 5 |
+
"node_ids": [
|
| 6 |
+
"29d505a0536ac998",
|
| 7 |
+
"89a8294091d81ce8",
|
| 8 |
+
"b66f51090366d95e",
|
| 9 |
+
"5f2a79a8d81b577c",
|
| 10 |
+
"5c3c6ed4d533a2c1",
|
| 11 |
+
"b66612a925339457",
|
| 12 |
+
"dd9e68038557a4d1",
|
| 13 |
+
"ca41d203d504603f",
|
| 14 |
+
"6f9322cda2097382",
|
| 15 |
+
"16cefbcfc3301ca1",
|
| 16 |
+
"71016125a13c0e0d",
|
| 17 |
+
"76c1b09c6ba5c29d",
|
| 18 |
+
"86eced7f7b77fe6a",
|
| 19 |
+
"ca36425b9fea1047",
|
| 20 |
+
"ed1e551c2e2c23f4",
|
| 21 |
+
"f33f68f0f3885639",
|
| 22 |
+
"212a4cbee48972fb",
|
| 23 |
+
"699ec9c1516b9078",
|
| 24 |
+
"6b2065ed53799d5d",
|
| 25 |
+
"bae673da8da75f34",
|
| 26 |
+
"9bd3d8195311b86a",
|
| 27 |
+
"ffe98e727478892d",
|
| 28 |
+
"e875322dc2569a22",
|
| 29 |
+
"06e267506e80e944",
|
| 30 |
+
"2e8b38e56500a7a3",
|
| 31 |
+
"189fbf0e885b2cd2",
|
| 32 |
+
"a0c1c19f89128b5b",
|
| 33 |
+
"346996767a29a82b",
|
| 34 |
+
"4ce0475ef4976b1f",
|
| 35 |
+
"991851bc4eb66ce7",
|
| 36 |
+
"00cd6ec8a2ea3f44",
|
| 37 |
+
"dd19c79674faa7bd",
|
| 38 |
+
"fc7e425c76ed0914",
|
| 39 |
+
"352c95a60b376529",
|
| 40 |
+
"0968094acb78c1ad",
|
| 41 |
+
"e90e15bb3ebc3b46",
|
| 42 |
+
"206d761f50f8f06d",
|
| 43 |
+
"98967461c0d03924",
|
| 44 |
+
"8c57b3adbf877d92",
|
| 45 |
+
"50eaaf0454fed8de",
|
| 46 |
+
"a96b08dadeee4ed1",
|
| 47 |
+
"7b046ef3d57edb7a",
|
| 48 |
+
"206d4facf8046312",
|
| 49 |
+
"f720170162ea75a3",
|
| 50 |
+
"48e8eb52059e5bab",
|
| 51 |
+
"b5bd38732575124b",
|
| 52 |
+
"6dbca2dfad017f88",
|
| 53 |
+
"922d634207f1bd3b",
|
| 54 |
+
"9d185bb8bfa967bc",
|
| 55 |
+
"7aedef9d8d8558b6",
|
| 56 |
+
"5d79062899f094ab",
|
| 57 |
+
"bfa00e8db74c2bff",
|
| 58 |
+
"913d7e8e7c2f67f4",
|
| 59 |
+
"6e95386ba46783ae",
|
| 60 |
+
"7a34d4c3a4e2a5d9",
|
| 61 |
+
"e27b59e793c2c29d",
|
| 62 |
+
"acdd1ddf99d6466e",
|
| 63 |
+
"54e5434714345859",
|
| 64 |
+
"e9177cbaa806b0f2",
|
| 65 |
+
"8ea18a7337f8bebc",
|
| 66 |
+
"1b9a3e20d4e2de7d",
|
| 67 |
+
"496bd95e14f270d0",
|
| 68 |
+
"84799c7a2e037df3",
|
| 69 |
+
"dd260601f883f45b",
|
| 70 |
+
"b8a44fca7a25d75e",
|
| 71 |
+
"59990e00fb6dc958",
|
| 72 |
+
"c6a80669b78e38d0",
|
| 73 |
+
"01cb9d90b7f7b0fb",
|
| 74 |
+
"d5bf12aea01ab74a",
|
| 75 |
+
"d5000b0072dd6b2e",
|
| 76 |
+
"1932d4efc6aed553",
|
| 77 |
+
"cf69446fcc226f76",
|
| 78 |
+
"bb0e47ca7d64c3a2",
|
| 79 |
+
"0e25ef31b4529625",
|
| 80 |
+
"c9fbd3e8cd885042",
|
| 81 |
+
"6b97bbdb92030366",
|
| 82 |
+
"319103bce9ac4ecb",
|
| 83 |
+
"245ea398e76f2b9d",
|
| 84 |
+
"5e1dde48ef03f452",
|
| 85 |
+
"355b77129964e5ba",
|
| 86 |
+
"604e588ff8b579b8",
|
| 87 |
+
"5af4e9f79989b6cc",
|
| 88 |
+
"c0b73e547fb01a0d",
|
| 89 |
+
"4e3a6a778bd02a9a",
|
| 90 |
+
"89b95069493fa022",
|
| 91 |
+
"ce706d1201dd22bb",
|
| 92 |
+
"701e78c42935ac92",
|
| 93 |
+
"2396ca191bb74a2f",
|
| 94 |
+
"b18f2b51c06200ef",
|
| 95 |
+
"0c36a78449184e85",
|
| 96 |
+
"065ba63a18a9cf96",
|
| 97 |
+
"0fedc5641985ff4f",
|
| 98 |
+
"2c8678efa06f67f9",
|
| 99 |
+
"b972a394b89fc296",
|
| 100 |
+
"a156e5e34505bff7",
|
| 101 |
+
"ceae3c879404fdc2",
|
| 102 |
+
"4c8642012c1b3a70",
|
| 103 |
+
"01c24beb989b4039",
|
| 104 |
+
"5d47659c01a5bfe0",
|
| 105 |
+
"79f52309b208feba",
|
| 106 |
+
"b595cc5c5dbc9f27",
|
| 107 |
+
"b88bcdbd0c134780",
|
| 108 |
+
"48c60bc68fad6568",
|
| 109 |
+
"948493e60578ae31",
|
| 110 |
+
"0e97e28b90683458",
|
| 111 |
+
"055172975b24840d",
|
| 112 |
+
"b39b76920d6df157",
|
| 113 |
+
"d717d12183a88d4b",
|
| 114 |
+
"096b111a255bad7f",
|
| 115 |
+
"122b3fe2e920d773",
|
| 116 |
+
"322cec6ce3bc2db5",
|
| 117 |
+
"dfe899f01540e66e",
|
| 118 |
+
"d62471d8c2e91d44",
|
| 119 |
+
"d06b53ab82edfc5c",
|
| 120 |
+
"58293efde3d65766",
|
| 121 |
+
"f09ce6171ec29ca4",
|
| 122 |
+
"fb8e867f51196d1e",
|
| 123 |
+
"1a39e93b2f96d97a",
|
| 124 |
+
"8903da6c47c5b405",
|
| 125 |
+
"c13c1778434d4401",
|
| 126 |
+
"d9e6447f15698416",
|
| 127 |
+
"3159a3e3e182afdb",
|
| 128 |
+
"250d9f3642fd0543",
|
| 129 |
+
"f58e8e6f3ea6435a",
|
| 130 |
+
"cc66c883cdb05b6c",
|
| 131 |
+
"f9491e067e6094a3",
|
| 132 |
+
"7f6f05d5d2de017e",
|
| 133 |
+
"8e0e8ac292079e8f",
|
| 134 |
+
"dfa8525adddea21c",
|
| 135 |
+
"782aee541767578b",
|
| 136 |
+
"8a79109dedafded4",
|
| 137 |
+
"ee26938da00b173f",
|
| 138 |
+
"97722021b1a2792b",
|
| 139 |
+
"3d510e2163671925",
|
| 140 |
+
"6ad061c8308163a4",
|
| 141 |
+
"3e8c1df8b14f7b40",
|
| 142 |
+
"905eb46730ee5067",
|
| 143 |
+
"c2cb6858473acfca",
|
| 144 |
+
"784c19bb97840e78",
|
| 145 |
+
"5bd6e1530b6209b9",
|
| 146 |
+
"be6b689037cfd3f5",
|
| 147 |
+
"8f6a3bfa88668fda",
|
| 148 |
+
"6a877c5c22acd65d",
|
| 149 |
+
"baaee1ebf82f1ca0",
|
| 150 |
+
"0c32e3788d1919fe",
|
| 151 |
+
"4fed50ed9d1c3c43",
|
| 152 |
+
"d3934aa41810502a",
|
| 153 |
+
"9b44aac7b71f3b81",
|
| 154 |
+
"7d417c302db221f8",
|
| 155 |
+
"86fd6e5fe8e9acc5",
|
| 156 |
+
"8b2212d720e7654c",
|
| 157 |
+
"43c51e4e3af97e3b",
|
| 158 |
+
"b0c9f122b4dde681",
|
| 159 |
+
"c0c976dc9f08ee09",
|
| 160 |
+
"8b39040d3506de46",
|
| 161 |
+
"6b557e862a4e7a83",
|
| 162 |
+
"13f2eb73585baa18",
|
| 163 |
+
"290254ba61badf4d",
|
| 164 |
+
"c8c2ed4483d9e387",
|
| 165 |
+
"4f6771b02873d1a8",
|
| 166 |
+
"b96397144e6ee4c1",
|
| 167 |
+
"e3c29c94fe5f788a",
|
| 168 |
+
"d782275144689a32",
|
| 169 |
+
"60102cdf4d854f34",
|
| 170 |
+
"cf56ebddf5a35607",
|
| 171 |
+
"bd8c382a93b83173",
|
| 172 |
+
"41dd209cb0943b81",
|
| 173 |
+
"8fe70805d62442fe",
|
| 174 |
+
"45aa5067f79407c7",
|
| 175 |
+
"16a58157ceda2854",
|
| 176 |
+
"cef1dffeff86b2ad",
|
| 177 |
+
"5187796566007fb7",
|
| 178 |
+
"8ae61549fb9dbd7f",
|
| 179 |
+
"4e1c9e09669d9739",
|
| 180 |
+
"70fbedafe4c03090",
|
| 181 |
+
"825552a12c15e926",
|
| 182 |
+
"4608f9da19534fdf",
|
| 183 |
+
"9696f2942cbcefbf",
|
| 184 |
+
"970141d598e2b973",
|
| 185 |
+
"d02c0c0e7726d3f1",
|
| 186 |
+
"76840deb548f6613",
|
| 187 |
+
"272fc3de5af4b271",
|
| 188 |
+
"986d05c4d03adaf6",
|
| 189 |
+
"34c3561b548abce9",
|
| 190 |
+
"00cf239d3fdd3c0b",
|
| 191 |
+
"3abc3cf6dfcc60ab",
|
| 192 |
+
"d9262d20fe39c685",
|
| 193 |
+
"d6caffc95d4ea230",
|
| 194 |
+
"4f1d66c8ddf1b649",
|
| 195 |
+
"9911b886b4e40ea5",
|
| 196 |
+
"05094b9aa362a52d",
|
| 197 |
+
"938619211ac368a8",
|
| 198 |
+
"c4f97b6cbc55754d",
|
| 199 |
+
"50c3bc64c15d6a7b",
|
| 200 |
+
"b5dc06eb5bd93776",
|
| 201 |
+
"9ae9a1669500ffa1",
|
| 202 |
+
"97cd6dc5749e6b61",
|
| 203 |
+
"ed929af314e5c217",
|
| 204 |
+
"27539ade77e10aa5",
|
| 205 |
+
"7da8314b0a04d391",
|
| 206 |
+
"50b9cb80c26211e4",
|
| 207 |
+
"8ef745e21a875f72",
|
| 208 |
+
"5bce0cd172cf9389",
|
| 209 |
+
"49f9040d6c533349",
|
| 210 |
+
"0969f1b2971aa784",
|
| 211 |
+
"7195222d09304114",
|
| 212 |
+
"37948bdfda119408",
|
| 213 |
+
"5500e778ecf224d5",
|
| 214 |
+
"5893788174cc1112",
|
| 215 |
+
"c8c030a0bccb918c",
|
| 216 |
+
"8106aa9561ab7c2f",
|
| 217 |
+
"9b1d2e56d20f1ed0",
|
| 218 |
+
"310090e8deaa9df2",
|
| 219 |
+
"aea9a80c0b30ac81",
|
| 220 |
+
"ae903905b511ec19",
|
| 221 |
+
"867422b0430e1564",
|
| 222 |
+
"a900f9c64ad40b8c",
|
| 223 |
+
"cb0eaa459b89d023",
|
| 224 |
+
"eac8143bbdabd053",
|
| 225 |
+
"eccdae69f659f373",
|
| 226 |
+
"ee872eca29098193",
|
| 227 |
+
"c41170818cb6c11a",
|
| 228 |
+
"a0afc115cc35343d",
|
| 229 |
+
"7d7502050cc53cdd",
|
| 230 |
+
"ebfc6922e8102c64",
|
| 231 |
+
"44ff037b4be33247",
|
| 232 |
+
"c6ff2def708b661a",
|
| 233 |
+
"615aeabe0d2e90c4",
|
| 234 |
+
"906125f79a596b75",
|
| 235 |
+
"72c3ce0a8eee26de",
|
| 236 |
+
"730864a279fa38ed",
|
| 237 |
+
"c2f2ef49af307713",
|
| 238 |
+
"cf3cd7d2b4d5ff70",
|
| 239 |
+
"2f56f53dd8d0ca61",
|
| 240 |
+
"3482d2ca636c03f4",
|
| 241 |
+
"69a63c2f8c3447d6",
|
| 242 |
+
"f74e210776727dd2",
|
| 243 |
+
"a7c0a7066c2e9018",
|
| 244 |
+
"b445c482d619deb7",
|
| 245 |
+
"b782f37083fcb21e",
|
| 246 |
+
"ec472be46def442f",
|
| 247 |
+
"3661c2e944d40b82",
|
| 248 |
+
"eae30d306871e586",
|
| 249 |
+
"4edfea8258d94845",
|
| 250 |
+
"34fb96ce85d84537",
|
| 251 |
+
"60d10086ca90e035",
|
| 252 |
+
"6ee638ce41216594",
|
| 253 |
+
"b8ade73e2d5d467f",
|
| 254 |
+
"31746da19135eb78",
|
| 255 |
+
"9046fecf16ca5776",
|
| 256 |
+
"0de8a786de8a293a",
|
| 257 |
+
"409f31faf973efbb",
|
| 258 |
+
"c73573bcb8a98942",
|
| 259 |
+
"0f8293abf49fb235",
|
| 260 |
+
"f8d71c90965bef12",
|
| 261 |
+
"374bf375044352a6",
|
| 262 |
+
"2a8fb4bebe84aa01",
|
| 263 |
+
"1d00762131f7927c",
|
| 264 |
+
"c50aef3ee34c16b0",
|
| 265 |
+
"07624d428730f41a",
|
| 266 |
+
"0d39e7b35c5a076e",
|
| 267 |
+
"0fb142c0536723a5",
|
| 268 |
+
"581d47447ce9c3f4",
|
| 269 |
+
"43f1fa06d0fb0bd7",
|
| 270 |
+
"05fd3595bf5c36de",
|
| 271 |
+
"db6dd07ac3e3c6b9",
|
| 272 |
+
"48dc9a385ba4fbd8",
|
| 273 |
+
"09d7289e764aa019",
|
| 274 |
+
"a8ce5d63880f52fa",
|
| 275 |
+
"aa046696ad983530",
|
| 276 |
+
"19f4b16b62f7d2ac",
|
| 277 |
+
"3324747a90504200",
|
| 278 |
+
"d82a8d307fd8eb65"
|
| 279 |
+
],
|
| 280 |
+
"summary": "Key entities: Комп'ютерні науки, протокол № 2, кафедри математичного моделювання та аналізу даних, Євгеній ПОКЛОНСЬКИЙ, Володимир СТРУКОВ. Topics: Міністерство освіти і науки України; MIHICTEPCTBO OCBITI4 I HAYKII YKPAIITTT; MIHICTEPCTBO OCBITII I HA)TKI,I YKPAIHI4",
|
| 281 |
+
"key_entities": [
|
| 282 |
+
"Комп'ютерні науки",
|
| 283 |
+
"протокол № 2",
|
| 284 |
+
"кафедри математичного моделювання та аналізу даних",
|
| 285 |
+
"Євгеній ПОКЛОНСЬКИЙ",
|
| 286 |
+
"Володимир СТРУКОВ",
|
| 287 |
+
"студент",
|
| 288 |
+
"23 вересня 2024 року",
|
| 289 |
+
"МІШИН Олександр Вікторович"
|
| 290 |
+
],
|
| 291 |
+
"key_topics": [
|
| 292 |
+
"Міністерство освіти і науки України",
|
| 293 |
+
"MIHICTEPCTBO OCBITI4 I HAYKII YKPAIITTT",
|
| 294 |
+
"MIHICTEPCTBO OCBITII I HA)TKI,I YKPAIHI4",
|
| 295 |
+
"MIHICTEPCTBO ocBITrI I HAyKI,I yKpAiuI,t",
|
| 296 |
+
"Міністерство освіти і науки України"
|
| 297 |
+
],
|
| 298 |
+
"size": 273
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"community_id": "community_0_1",
|
| 302 |
+
"level": 0,
|
| 303 |
+
"node_ids": [
|
| 304 |
+
"ff9c88b3a7fb7942",
|
| 305 |
+
"7d2fabee274a867b",
|
| 306 |
+
"c0031f76c1b5be0a",
|
| 307 |
+
"aec957bff3333594",
|
| 308 |
+
"7ba9bd963e52750c",
|
| 309 |
+
"55caa1ade06bdf64",
|
| 310 |
+
"ef794c82b54bd7fa",
|
| 311 |
+
"9e41a312d020a671",
|
| 312 |
+
"bef65c47c7457b9b",
|
| 313 |
+
"e6d769da18887031",
|
| 314 |
+
"d95be0bb9a2a5e85",
|
| 315 |
+
"3725adad1a375aae",
|
| 316 |
+
"98bae9ab7f84fab6",
|
| 317 |
+
"c0c2349542f40602",
|
| 318 |
+
"2a766068f1b63784",
|
| 319 |
+
"82e419ef4b6a4f3a",
|
| 320 |
+
"d6aab886d9134bde",
|
| 321 |
+
"e2a880a0cee5051c",
|
| 322 |
+
"9d6f6cdfde3187d2",
|
| 323 |
+
"b29667928e818bb6",
|
| 324 |
+
"d89f7445018e510e",
|
| 325 |
+
"17fca38c5fe9bbe2"
|
| 326 |
+
],
|
| 327 |
+
"summary": "Key entities: Комп'ютерні науки, БАБЕНКО Віталіна Олексіївна, бакалавр, ННІ комп’ютерних наук та штучного інтелекту, студент. Topics: 1. Профіль освітньої програми «Комп’ютерні науки»; 1. Профіль освітньої програми «Комп’ютерні науки»; 1. Профіль освітньої програми «Комп’ютерні науки». Content: До захисту кваліфікаційної роботи бакалавра допускаються студенти, які виконали всі вимоги навчального плану. Захист кваліфікаційної роботи бакалавра має своєю метою з’ясування рівня підготовленості в Каразіна, кафедра штучного інтелекту та програмного забезпечення, кафедра моделювання систем і техн",
|
| 328 |
+
"key_entities": [
|
| 329 |
+
"Комп'ютерні науки",
|
| 330 |
+
"БАБЕНКО Віталіна Олексіївна",
|
| 331 |
+
"бакалавр",
|
| 332 |
+
"ННІ комп’ютерних наук та штучного інтелекту",
|
| 333 |
+
"студент",
|
| 334 |
+
"розпізнавання",
|
| 335 |
+
"класифікації",
|
| 336 |
+
"МІХЄЄВ Іван Андрійович"
|
| 337 |
+
],
|
| 338 |
+
"key_topics": [
|
| 339 |
+
"1. Профіль освітньої програми «Комп’ютерні науки»",
|
| 340 |
+
"1. Профіль освітньої програми «Комп’ютерні науки»",
|
| 341 |
+
"1. Профіль освітньої програми «Комп’ютерні науки»",
|
| 342 |
+
"1. Профіль освітньої програми «Комп’ютерні науки»",
|
| 343 |
+
"1. Обов’язкові компоненти ОП"
|
| 344 |
+
],
|
| 345 |
+
"size": 22
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"community_id": "community_0_2",
|
| 349 |
+
"level": 0,
|
| 350 |
+
"node_ids": [
|
| 351 |
+
"22cdbb71f4bf1575",
|
| 352 |
+
"ad982e91da077a15",
|
| 353 |
+
"e10dfcaded37865c",
|
| 354 |
+
"b3bac256cd6b3f60",
|
| 355 |
+
"76df204594c6c6fc",
|
| 356 |
+
"83fdca75c9a45216",
|
| 357 |
+
"e82765265030c0b3",
|
| 358 |
+
"f5a9189542062792",
|
| 359 |
+
"d42293ae57732392",
|
| 360 |
+
"4f19ac3696aba26d",
|
| 361 |
+
"a85130ce52e9d5ad",
|
| 362 |
+
"07abb3f4f4f1acfc",
|
| 363 |
+
"e3dc8b7767957357",
|
| 364 |
+
"0772f4f3d26b8789",
|
| 365 |
+
"b68c19927a7e67f9",
|
| 366 |
+
"60ad204aaed1308f",
|
| 367 |
+
"ca545a46260dd835",
|
| 368 |
+
"f6ee4dbcc8c3594d",
|
| 369 |
+
"f648911d33733b47"
|
| 370 |
+
],
|
| 371 |
+
"summary": "Key entities: комп’ютерної інженерії, Харківського національного університету імені В. Каразіна, професорами, ЕПАМ, однієї або більше академічних груп. Topics: 1. Профіль освітньої програми «Комп’ютерна інженерія»; 1. Профіль освітньої програми «Комп’ютерна інженерія»; 5. Матриця забезпечення програмних результатів навчання (ПРН). Content: Форми атестації здобувачів вищої освіти Атестація осіб, які здобувають ступінь бакалавра з комп’ютерної інженерії, проводиться у формі атестаційного екзамену і захисту кваліфікаційної роботи бакалавра У освітньому процесі використовуються такі види контролю: вхідний, поточний, підсумковий у вигляді ",
|
| 372 |
+
"key_entities": [
|
| 373 |
+
"комп’ютерної інженерії",
|
| 374 |
+
"Харківського національного університету імені В. Каразіна",
|
| 375 |
+
"професорами",
|
| 376 |
+
"ЕПАМ",
|
| 377 |
+
"однієї або більше академічних груп",
|
| 378 |
+
"МІХЄЄВ Іван Андрійович",
|
| 379 |
+
"Стрілець Вікторія Євгенівна",
|
| 380 |
+
"240 кредитів ЄКТС"
|
| 381 |
+
],
|
| 382 |
+
"key_topics": [
|
| 383 |
+
"1. Профіль освітньої програми «Комп’ютерна інженерія»",
|
| 384 |
+
"1. Профіль освітньої програми «Комп’ютерна інженерія»",
|
| 385 |
+
"5. Матриця забезпечення програмних результатів навчання (ПРН)",
|
| 386 |
+
"4. Матриця відповідності програмних компетентностей",
|
| 387 |
+
"3. Форма атестації здобувачів вищої освіти"
|
| 388 |
+
],
|
| 389 |
+
"size": 19
|
| 390 |
+
},
|
| 391 |
+
{
|
| 392 |
+
"community_id": "community_0_3",
|
| 393 |
+
"level": 0,
|
| 394 |
+
"node_ids": [
|
| 395 |
+
"000ecf882de345cd",
|
| 396 |
+
"e552699b03b077fe",
|
| 397 |
+
"ba847fbe086d51b1",
|
| 398 |
+
"2fdeb829268a59b3",
|
| 399 |
+
"d1c5ea058e826417",
|
| 400 |
+
"6feefa140fa3a2d4",
|
| 401 |
+
"7c5467d53d80fcdd",
|
| 402 |
+
"cd050aec27af9add",
|
| 403 |
+
"5d3f888365936987",
|
| 404 |
+
"239cfef495d5fcb0",
|
| 405 |
+
"51a1661d308c3929",
|
| 406 |
+
"e640d430651812eb",
|
| 407 |
+
"070c0b50ef0891a7",
|
| 408 |
+
"06181b90d562d99a",
|
| 409 |
+
"2a282e6e2664aecf",
|
| 410 |
+
"62c7daff1b3e2b0d",
|
| 411 |
+
"b2a06019710d6478",
|
| 412 |
+
"219dd7d878ce9601",
|
| 413 |
+
"5f91796791f7b0d0",
|
| 414 |
+
"b5afffb06306c471",
|
| 415 |
+
"d44febd45afddc60",
|
| 416 |
+
"40cabbdcd801245c"
|
| 417 |
+
],
|
| 418 |
+
"summary": "Key entities: електротехніку, Харківського національного університету імені В. Каразіна, бакалавр, спеціальності, Автоматизація та комп’ютерно- інтегровані технології. Topics: 1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі; 3. Форма атестації здобувачів вищої освіти; 1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі. Content: Форми атестації здобувачів вищої освіти Атестація осіб, які здобувають ступінь бакалавра з автоматизації, комп’ютерно-інтегрованих технологій та робототехніки проводиться у формі атестаційного екзамен Kpf-S' Nt (,,/t/- 4 Bo-neAarori.rHoi po6orn Eopnc CAMOPOAOB ffi ffi JrrrcT rroroAxrEHHs ofrfr Hayxo",
|
| 419 |
+
"key_entities": [
|
| 420 |
+
"електротехніку",
|
| 421 |
+
"Харківського національного університету імені В. Каразіна",
|
| 422 |
+
"бакалавр",
|
| 423 |
+
"спеціальності",
|
| 424 |
+
"Автоматизація та комп’ютерно- інтегровані технології",
|
| 425 |
+
"професорами",
|
| 426 |
+
"ЕПАМ",
|
| 427 |
+
"однієї або більше академічних груп"
|
| 428 |
+
],
|
| 429 |
+
"key_topics": [
|
| 430 |
+
"1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі",
|
| 431 |
+
"3. Форма атестації здобувачів вищої освіти",
|
| 432 |
+
"1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі",
|
| 433 |
+
"1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі",
|
| 434 |
+
"1. Профіль освітньої програми «Автоматизація та комп’ютерно-інтегровані технології» зі"
|
| 435 |
+
],
|
| 436 |
+
"size": 22
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"community_id": "community_0_4",
|
| 440 |
+
"level": 0,
|
| 441 |
+
"node_ids": [
|
| 442 |
+
"afedd4c94bd030de"
|
| 443 |
+
],
|
| 444 |
+
"summary": "Key entities: військовий квиток або приписне посвідчення. Content: Перелік документів для вступу документ державного зразка про раніше здобутий освітній рівень, на основі яког о здійснюєтьс я всту п (атестат) , і додаток до нього; до нього; результат Національного му",
|
| 445 |
+
"key_entities": [
|
| 446 |
+
"військовий квиток або приписне посвідчення"
|
| 447 |
+
],
|
| 448 |
+
"key_topics": [],
|
| 449 |
+
"size": 1
|
| 450 |
+
},
|
| 451 |
+
{
|
| 452 |
+
"community_id": "community_0_5",
|
| 453 |
+
"level": 0,
|
| 454 |
+
"node_ids": [
|
| 455 |
+
"d624229769e51fdb",
|
| 456 |
+
"ae6dfdb9a7d2be03",
|
| 457 |
+
"a3f7f078e86de037",
|
| 458 |
+
"bbb2d6d6809f3ad9",
|
| 459 |
+
"c1f0d26d3293af30",
|
| 460 |
+
"a2870e4a7dc5eaf7",
|
| 461 |
+
"1f7deed0ad1b97b8",
|
| 462 |
+
"d2f99ab8996bbca3",
|
| 463 |
+
"a9d561875549b1d7",
|
| 464 |
+
"9f93734072831671",
|
| 465 |
+
"e8963556b85f7295",
|
| 466 |
+
"79c4619dc97f67c5",
|
| 467 |
+
"9f9129eb9ea4105d",
|
| 468 |
+
"c9333c7f250d9980",
|
| 469 |
+
"fb83621a8920b5de",
|
| 470 |
+
"62ae455a86fcce18",
|
| 471 |
+
"6c9896a73eb9dadb",
|
| 472 |
+
"c39eeedf1701877c",
|
| 473 |
+
"4f431acaf7aa3091",
|
| 474 |
+
"c8a70c290d054a7f",
|
| 475 |
+
"65cea25048e03600",
|
| 476 |
+
"fc592b77397bf392",
|
| 477 |
+
"0eb62e34341fc885",
|
| 478 |
+
"d45705c9d6499369",
|
| 479 |
+
"096a4efb16a2129a",
|
| 480 |
+
"3d8cbb26e53df2ae",
|
| 481 |
+
"3d732d1deab61257",
|
| 482 |
+
"43d1da5288d7920a",
|
| 483 |
+
"d2f3c36b8ca97a55",
|
| 484 |
+
"98b2724ab31d2fff",
|
| 485 |
+
"b8c1123eb432e346",
|
| 486 |
+
"96cc9792297b04c8",
|
| 487 |
+
"20120b0b0d706b71",
|
| 488 |
+
"daf8c3052517c0c4",
|
| 489 |
+
"e40c978ac81c8e56",
|
| 490 |
+
"ebbeae105f902e16",
|
| 491 |
+
"2116032ea505ca64",
|
| 492 |
+
"5834b4ab5eb62ddc",
|
| 493 |
+
"7608a58e8c99674f"
|
| 494 |
+
],
|
| 495 |
+
"summary": "Key entities: II семестр, студент, тести, 12 3 2 2 5, 2019. Topics: 6. Допоміжні індивідуальні завдання; Розділ 3. Процедурні розширення SQL; Розділ 1. Основи SQL та маніпулювання даними.. Content: Для студентів, які навчаються відповідно до індивідуального плану, лекторами можуть видані допоміжні індивідуальні завдання, в залежності від їх рівня підготовки та умов праці. Розробляються ці допомі Критерії оцінювання екзамену Теоретичні завдання оцінюються таким чином: • 100% від запланованої кі",
|
| 496 |
+
"key_entities": [
|
| 497 |
+
"II семестр",
|
| 498 |
+
"студент",
|
| 499 |
+
"тести",
|
| 500 |
+
"12 3 2 2 5",
|
| 501 |
+
"2019",
|
| 502 |
+
"Комп'ютерні науки",
|
| 503 |
+
"протокол № 2",
|
| 504 |
+
"кафедри математичного моделювання та аналізу даних"
|
| 505 |
+
],
|
| 506 |
+
"key_topics": [
|
| 507 |
+
"6. Допоміжні індивідуальні завдання",
|
| 508 |
+
"Розділ 3. Процедурні розширення SQL",
|
| 509 |
+
"Розділ 1. Основи SQL та маніпулювання даними.",
|
| 510 |
+
"Розділ 1. Основи SQL та маніпулювання даними",
|
| 511 |
+
"Розділ 1"
|
| 512 |
+
],
|
| 513 |
+
"size": 39
|
| 514 |
+
},
|
| 515 |
+
{
|
| 516 |
+
"community_id": "community_0_6",
|
| 517 |
+
"level": 0,
|
| 518 |
+
"node_ids": [
|
| 519 |
+
"e387576a2920a178",
|
| 520 |
+
"9e384c927f76dbf0",
|
| 521 |
+
"04f251f674cdea32",
|
| 522 |
+
"ef69db613b1f8839",
|
| 523 |
+
"8d3d649dcf826b56",
|
| 524 |
+
"4c7146324ce73435",
|
| 525 |
+
"83c09122974e4dd1",
|
| 526 |
+
"9749c0c98e2d3561",
|
| 527 |
+
"0ac42e4a5b390373",
|
| 528 |
+
"80f53cdccfc87c4e",
|
| 529 |
+
"39ec90a64e5db59f",
|
| 530 |
+
"12639a6d0134394d",
|
| 531 |
+
"3fa3e2126481683a",
|
| 532 |
+
"193f6c0d3c4a1271",
|
| 533 |
+
"d6b188f4e2232a2c",
|
| 534 |
+
"3ce4e4972c2988b2",
|
| 535 |
+
"eef99ef6dde1ae05",
|
| 536 |
+
"34983f681748b1c6",
|
| 537 |
+
"894d9130656e54a7",
|
| 538 |
+
"4377f6786a5a41fd",
|
| 539 |
+
"2a3122d2a379c1ce",
|
| 540 |
+
"0b6a89b240bf5698",
|
| 541 |
+
"8a9f606791f188a9",
|
| 542 |
+
"86f1ca29b96afbf4",
|
| 543 |
+
"7309e0bd987b1588",
|
| 544 |
+
"a726ac326d586236",
|
| 545 |
+
"256daeeb4709366c",
|
| 546 |
+
"ff3d29a7c347f81f"
|
| 547 |
+
],
|
| 548 |
+
"summary": "Key entities: студент, Комп'ютерні науки, протокол № 2, github, практичних навичок. Topics: 5. Denny Britz, Learning Reinforcement Learning (with Code, Exercises and Solutions),; Розділ 1; 2. https://scikit-. Content: Критерії оцінювання знань студентів під час підсумкового контролю 16-20 балів: - студент демонструє глибоке розуміння теми питання; - студент повністю розкриває сутність питання; - в роботі наведені п Відвідування лекцій: 5 балів: студент відвідав 90 - 100 % лекційних занять; 4 бали: студент відвіда",
|
| 549 |
+
"key_entities": [
|
| 550 |
+
"студент",
|
| 551 |
+
"Комп'ютерні науки",
|
| 552 |
+
"протокол № 2",
|
| 553 |
+
"github",
|
| 554 |
+
"практичних навичок",
|
| 555 |
+
"2019",
|
| 556 |
+
"Manning Publications",
|
| 557 |
+
"кафедри математичного моделювання та аналізу даних"
|
| 558 |
+
],
|
| 559 |
+
"key_topics": [
|
| 560 |
+
"5. Denny Britz, Learning Reinforcement Learning (with Code, Exercises and Solutions),",
|
| 561 |
+
"Розділ 1",
|
| 562 |
+
"2. https://scikit-",
|
| 563 |
+
"1.2. Основними завданнями вивчення дисципліни є:",
|
| 564 |
+
"Розділ 1. Методи машинного навчання"
|
| 565 |
+
],
|
| 566 |
+
"size": 28
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"community_id": "community_0_7",
|
| 570 |
+
"level": 0,
|
| 571 |
+
"node_ids": [
|
| 572 |
+
"0e0658e7bdcc2a7c",
|
| 573 |
+
"6ce78cb4df323fb4",
|
| 574 |
+
"e935c846d8eafba4",
|
| 575 |
+
"77436f64ff77291a",
|
| 576 |
+
"ba16757a34c2d1dd",
|
| 577 |
+
"99190c180bdba5e1",
|
| 578 |
+
"5a49d50c4b784809",
|
| 579 |
+
"625124b98bff96e2",
|
| 580 |
+
"51c002d66b88ec4f",
|
| 581 |
+
"6c9a5cda2d241944",
|
| 582 |
+
"47cac461812d84b6",
|
| 583 |
+
"3aa93846fb2a2d54",
|
| 584 |
+
"0bef74a36daf31d1",
|
| 585 |
+
"448e666ba8fe40f9",
|
| 586 |
+
"54fd7d05fdccb9d3",
|
| 587 |
+
"0facfaf0f9bc4723",
|
| 588 |
+
"ff5b718ccb21d9ec",
|
| 589 |
+
"004de6bc3f3b2ffb",
|
| 590 |
+
"e2fbd3aa81f89018",
|
| 591 |
+
"a5282eb098a9de3b",
|
| 592 |
+
"42aff4b4c6cfc199",
|
| 593 |
+
"f6efa28179d67e17",
|
| 594 |
+
"c804c79f13931e2e",
|
| 595 |
+
"4a5e38c24938d7da",
|
| 596 |
+
"b5e350ee6dd237f5",
|
| 597 |
+
"06f02e5985dacd62",
|
| 598 |
+
"b123ad063fd6e616",
|
| 599 |
+
"fcf14927820636da",
|
| 600 |
+
"65a8851a10a46f61"
|
| 601 |
+
],
|
| 602 |
+
"summary": "Key entities: студент, Комп'ютерні науки, github, практичних навичок, 0 балів. Topics: 1.2. Основними завданнями вивчення дисципліни є:; 5. Denny Britz, Learning Reinforcement Learning (with Code, Exercises and Solutions),; 1.1. Метою викладання навчальної дисципліни є формування у студентів. Content: Критерії оцінювання знань студентів під час підсумкового контролю 16-20 балів: - студент демонструє глибоке розуміння теми питання; - студент повністю розкриває сутність питання; - в роботі наведені п Відвідування лекцій: 5 балів: студент відвідав 90 - 100 % лекційних занять; 4 бали: студент відвіда",
|
| 603 |
+
"key_entities": [
|
| 604 |
+
"студент",
|
| 605 |
+
"Комп'ютерні науки",
|
| 606 |
+
"github",
|
| 607 |
+
"практичних навичок",
|
| 608 |
+
"0 балів",
|
| 609 |
+
"2019",
|
| 610 |
+
"Manning Publications",
|
| 611 |
+
"протокол № 2"
|
| 612 |
+
],
|
| 613 |
+
"key_topics": [
|
| 614 |
+
"1.2. Основними завданнями вивчення дисципліни є:",
|
| 615 |
+
"5. Denny Britz, Learning Reinforcement Learning (with Code, Exercises and Solutions),",
|
| 616 |
+
"1.1. Метою викладання навчальної дисципліни є формування у студентів",
|
| 617 |
+
"1.5. Характеристика навчальної дисципліни",
|
| 618 |
+
"Розділ 1"
|
| 619 |
+
],
|
| 620 |
+
"size": 29
|
| 621 |
+
},
|
| 622 |
+
{
|
| 623 |
+
"community_id": "community_0_8",
|
| 624 |
+
"level": 0,
|
| 625 |
+
"node_ids": [
|
| 626 |
+
"5ca530193ac98906",
|
| 627 |
+
"0a0ed90dec090afb",
|
| 628 |
+
"783865a5bc89987e",
|
| 629 |
+
"ec194f577de382a2",
|
| 630 |
+
"9f7180e50738c510",
|
| 631 |
+
"b99ad375f679e170",
|
| 632 |
+
"6713fd91876278de",
|
| 633 |
+
"988e4b857edf2389",
|
| 634 |
+
"c45c28112723c377",
|
| 635 |
+
"cb820b8eef70d979",
|
| 636 |
+
"c8b5f45e988bfbdc",
|
| 637 |
+
"746f49fefb9761f2",
|
| 638 |
+
"4fa8387b8db3832a",
|
| 639 |
+
"8d359804040b7767",
|
| 640 |
+
"531476c443d08c99",
|
| 641 |
+
"e46db54fa020b2a4",
|
| 642 |
+
"1f4d0744f6c2aee8",
|
| 643 |
+
"799a59af806e688c",
|
| 644 |
+
"751f1934cb547fb0",
|
| 645 |
+
"ec4e92484f0a5a4e",
|
| 646 |
+
"11d0b9f27311fcf0",
|
| 647 |
+
"618e1da5930c5ace",
|
| 648 |
+
"c8ad92dd666d164b",
|
| 649 |
+
"b16452da8d4c69e0",
|
| 650 |
+
"b58de4bac0bacaa4",
|
| 651 |
+
"3142b0f33cc5cccf"
|
| 652 |
+
],
|
| 653 |
+
"summary": "Key entities: студент, Tableau, microsoft, Комп'ютерні науки, протокол № 2. Topics: 1.5. Заплановані результати навчання:; 1.2. Основні завдання вивчення дисципліни; Розділ 1. Основи візуалізації даних і Power BI.. Content: На досягнення освітніх цілей спрямовані такі методи навчання студентів: – практичні заняття (використовують для пізнання дійсності, формування навичок і вмінь, поглиблення знань. Під час їх застосуван Критерії оцінювання результатів роботи: • робота була виконана у відповідності з технічним завдання",
|
| 654 |
+
"key_entities": [
|
| 655 |
+
"студент",
|
| 656 |
+
"Tableau",
|
| 657 |
+
"microsoft",
|
| 658 |
+
"Комп'ютерні науки",
|
| 659 |
+
"протокол № 2",
|
| 660 |
+
"кафедри математичного моделювання та аналізу даних",
|
| 661 |
+
"Євгеній ПОКЛОНСЬКИЙ",
|
| 662 |
+
"Володимир СТРУКОВ"
|
| 663 |
+
],
|
| 664 |
+
"key_topics": [
|
| 665 |
+
"1.5. Заплановані результати навчання:",
|
| 666 |
+
"1.2. Основні завдання вивчення дисципліни",
|
| 667 |
+
"Розділ 1. Основи візуалізації даних і Power BI.",
|
| 668 |
+
"7. Методи навчання",
|
| 669 |
+
"1.1. Мета викладання навчальної дисципліни"
|
| 670 |
+
],
|
| 671 |
+
"size": 26
|
| 672 |
+
},
|
| 673 |
+
{
|
| 674 |
+
"community_id": "community_0_9",
|
| 675 |
+
"level": 0,
|
| 676 |
+
"node_ids": [
|
| 677 |
+
"2319b160a36f1033"
|
| 678 |
+
],
|
| 679 |
+
"summary": "Key entities: Харківського національного університету імені В. Каразіна, Денис КОВАЛЕНКО, Ольга ПЄШКОВА, Олександр КОЗЛОВ, Олександр ПЕЛЮХ. Content: Директор навчально-наукового інституту «Українська інженерно-педагогічна академія» Заступник директора Навчально- наукового інституту «Академія вчительства» В.о. декана ІІ Медичного факультету В.о. ди",
|
| 680 |
+
"key_entities": [
|
| 681 |
+
"Харківського національного університету імені В. Каразіна",
|
| 682 |
+
"Денис КОВАЛЕНКО",
|
| 683 |
+
"Ольга ПЄШКОВА",
|
| 684 |
+
"Олександр КОЗЛОВ",
|
| 685 |
+
"Олександр ПЕЛЮХ",
|
| 686 |
+
"Ганна ЗУБЕНКО",
|
| 687 |
+
"Українська інженерно-педагогічна академія",
|
| 688 |
+
"Медичного факультету"
|
| 689 |
+
],
|
| 690 |
+
"key_topics": [],
|
| 691 |
+
"size": 1
|
| 692 |
+
}
|
| 693 |
+
]
|
data/prebuilt/graph/graph_edges.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/prebuilt/graph/graph_nodes.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/prebuilt/index/community_embeddings.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eee6b12760e2d460f7ecdd8a879f4ad03c236fe77ab90b362fa2e2f84e5a74e0
|
| 3 |
+
size 41297
|
data/prebuilt/index/dense_index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da21e0ec093e55001e2de564762c98fe56a5ce9cc7e3f5696f7fd8dcb240f44e
|
| 3 |
+
size 1633386
|
data/prebuilt/index/dense_index.meta.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6974ecc903ffcfaedc0438a298036457e343b4d56266bc08f3a5f0bc37c6a2a9
|
| 3 |
+
size 271730
|
data/prebuilt/index/sparse_index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7b45f1486dfc8527cd5c72e400885ae70a1b8b17c915b7a1ffa091084f0045e
|
| 3 |
+
size 704878
|
graphrag_v4/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Cross-Document Knowledge Graphs (Optimized)"""
|
| 3 |
+
|
| 4 |
+
from .models import (
|
| 5 |
+
Chunk, Entity, Relation, Community, SearchResult, QAResult,
|
| 6 |
+
NodeType, EdgeType, DocType,
|
| 7 |
+
normalize_text, generate_id, detect_doc_type,
|
| 8 |
+
)
|
| 9 |
+
from .graph_builder import KnowledgeGraphBuilder
|
| 10 |
+
from .retriever import HybridRetriever
|
| 11 |
+
from .qa import GraphRAGQA, LLMClient
|
| 12 |
+
from .corpus_builder import build_corpus, process_document
|
| 13 |
+
|
| 14 |
+
__version__ = "4.1.0"
|
graphrag_v4/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1 kB). View file
|
|
|
graphrag_v4/__pycache__/chunking.cpython-311.pyc
ADDED
|
Binary file (11.6 kB). View file
|
|
|
graphrag_v4/__pycache__/corpus_builder.cpython-311.pyc
ADDED
|
Binary file (15.1 kB). View file
|
|
|
graphrag_v4/__pycache__/embeddings.cpython-311.pyc
ADDED
|
Binary file (6.64 kB). View file
|
|
|
graphrag_v4/__pycache__/extraction.cpython-311.pyc
ADDED
|
Binary file (25 kB). View file
|
|
|
graphrag_v4/__pycache__/graph_builder.cpython-311.pyc
ADDED
|
Binary file (36 kB). View file
|
|
|
graphrag_v4/__pycache__/models.cpython-311.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
graphrag_v4/__pycache__/qa.cpython-311.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
graphrag_v4/__pycache__/retriever.cpython-311.pyc
ADDED
|
Binary file (28 kB). View file
|
|
|
graphrag_v4/__pycache__/visualization.cpython-311.pyc
ADDED
|
Binary file (30.8 kB). View file
|
|
|
graphrag_v4/chunking.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Semantic Chunking (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- Pre-compiled regex patterns (avoid re.compile per call)
|
| 6 |
+
- Batch sentence embedding uses return_sparse=False (faster, only need dense for similarity)
|
| 7 |
+
- Small trailing groups merged into previous chunk
|
| 8 |
+
- Vectorized cosine similarity with numpy broadcasting
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import re
|
| 12 |
+
from typing import List, Dict
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
from .models import Chunk, generate_id
|
| 16 |
+
|
| 17 |
+
# Pre-compiled patterns (built once at import time, reused on every call).

# Any run of whitespace, newlines included; used to collapse text to single spaces.
_WHITESPACE_RE = re.compile(r'\s+')

# Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
# uppercase Cyrillic/Latin letter, or optional whitespace plus a newline that
# follows one of those terminators.
_SENTENCE_RE = re.compile(r'(?<=[.!?])\s+(?=[А-ЯІЇЄҐA-Z])|(?<=[.!?])\s*\n')

# Heading detectors as (compiled pattern, nesting level) pairs. The level is the
# 1-based position in the tuple below:
#   1 - "РОЗДІЛ"/"Розділ" + roman or arabic number
#   2 - "ГЛАВА"/"Глава" + roman or arabic number
#   3 - "Стаття"/"СТАТТЯ" + arabic number
#   4 - dotted numeric prefix ("1.", "1.2)") followed by at least 10 characters
_SECTION_PATTERNS = [
    (re.compile(_heading_regex, re.UNICODE), _level)
    for _level, _heading_regex in enumerate(
        (
            r'^[\s]*(?:РОЗДІЛ|Розділ)\s+([IVXLCDM\d]+)[.\s]*(.*)$',
            r'^[\s]*(?:ГЛАВА|Глава)\s+([IVXLCDM\d]+)[.\s]*(.*)$',
            r'^[\s]*(?:Стаття|СТАТТЯ)\s+(\d+)[.\s]*(.*)$',
            r'^[\s]*(\d+(?:\.\d+)*)[.\)]\s+(.{10,})$',
        ),
        start=1,
    )
]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SemanticChunker:
    """Split a paginated document into section-aware, semantically grouped chunks.

    Pages are first cut into sections by heading detection, each section is
    split into sentences, and consecutive sentences are grouped into chunks.
    A new chunk starts where the embedding similarity between neighbouring
    sentences drops, or where the token budget would be exceeded.

    Token counts are approximated as ``len(text) // 4``.
    """

    def __init__(
        self,
        max_chunk_tokens: int = 512,
        min_chunk_tokens: int = 100,
        similarity_threshold: float = 0.7,
        embedder=None,
    ):
        """Store chunking limits and an optional embedder.

        Args:
            max_chunk_tokens: Approximate upper bound at which a group is cut.
            min_chunk_tokens: Groups smaller than this are not flushed on a
                boundary; a small trailing group is merged into the previous one.
            similarity_threshold: Cosine similarity below which two consecutive
                sentences are treated as a semantic boundary.
            embedder: Object exposing ``embed_batch(texts, return_sparse=False)``
                whose results carry a ``.dense`` vector. Loaded lazily when None.
        """
        self.max_chunk_tokens = max_chunk_tokens
        self.min_chunk_tokens = min_chunk_tokens
        self.similarity_threshold = similarity_threshold
        self._embedder = embedder

    @property
    def embedder(self):
        """Return the embedder, importing and creating the shared one on first use."""
        if self._embedder is None:
            from .embeddings import get_embedder
            self._embedder = get_embedder()
        return self._embedder

    def chunk_document(
        self,
        pages: List[Dict],
        doc_id: str,
        doc_type: str,
        title: str = "",
    ) -> List[Chunk]:
        """Build the list of chunks for one document.

        Args:
            pages: Dicts with ``"page_num"`` and ``"text"`` keys.
            doc_id: Identifier stored on every chunk and used to derive chunk ids.
            doc_type: Document type label stored on every chunk.
            title: Document title; used in the parent context and as the chunk
                title for sections that have no heading of their own.

        Returns:
            Chunks in document order; empty when there is no usable text.
        """
        if not pages:
            return []

        sections = self._extract_sections(pages)
        if not sections:
            return []

        all_chunks = []
        chunk_idx = 0

        for section in sections:
            section_text = section.get("text", "").strip()
            if not section_text:
                continue

            sentences = self._split_sentences(section_text)
            if not sentences:
                continue

            # Very short sections are not worth an embedding round-trip.
            boundaries = self._find_semantic_boundaries(sentences) if len(sentences) > 3 else []
            groups = self._group_sentences(sentences, boundaries)

            # Both values are identical for every chunk of the section.
            # Sections always carry a "title" key (possibly empty), so a plain
            # .get() default would never fall back to the document title.
            section_title = section.get("title") or title
            parent_context = self._build_context(section, title)
            start_page = section.get("start_page", 1)

            for group in groups:
                chunk_text = " ".join(group)
                if len(chunk_text) < 50:
                    # Too short to be a useful retrieval unit.
                    continue

                all_chunks.append(
                    Chunk(
                        chunk_id=generate_id("chunk", doc_id, chunk_idx),
                        doc_id=doc_id,
                        doc_type=doc_type,
                        text=chunk_text,
                        title=section_title,
                        parent_context=parent_context,
                        page=start_page,
                    )
                )
                chunk_idx += 1

        return all_chunks

    def _extract_sections(self, pages: List[Dict]) -> List[Dict]:
        """Cut page text into sections at lines that match a heading pattern.

        Each section dict has ``level``, ``title`` (heading line truncated to
        100 characters), ``text`` (body lines joined by spaces; the heading
        line itself is not included), ``start_page`` and ``parent_titles``
        (titles of the enclosing, lower-level headings).
        """
        sections = []
        current_section = {"level": 0, "title": "", "text": "", "start_page": 1, "parent_titles": []}
        parent_stack = []  # (level, title) of the headings currently open

        for page in pages:
            page_num = page.get("page_num", 1)
            text = page.get("text", "")

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                matched_level = None
                matched_title = None

                for pattern, level in _SECTION_PATTERNS:
                    if pattern.match(line):
                        matched_level = level
                        matched_title = line[:100]
                        break

                if matched_level is not None:
                    # A heading closes the section in progress (if it has text).
                    if current_section["text"].strip():
                        sections.append(current_section)

                    # Drop headings at the same or a deeper level: they are
                    # siblings or children of earlier headings, not parents.
                    while parent_stack and parent_stack[-1][0] >= matched_level:
                        parent_stack.pop()

                    parent_titles = [p[1] for p in parent_stack]
                    parent_stack.append((matched_level, matched_title))

                    current_section = {
                        "level": matched_level,
                        "title": matched_title,
                        "text": "",
                        "start_page": page_num,
                        "parent_titles": parent_titles,
                    }
                else:
                    current_section["text"] += line + " "

        if current_section["text"].strip():
            sections.append(current_section)

        if not sections:
            # Reached when every non-blank line was a heading: fall back to
            # treating the raw text as one untitled section.
            all_text = " ".join(p.get("text", "") for p in pages)
            if all_text.strip():
                sections = [{"level": 0, "title": "", "text": all_text, "start_page": 1, "parent_titles": []}]

        return sections

    def _split_sentences(self, text: str) -> List[str]:
        """Split whitespace-normalised text into sentences.

        Fragments of 20 characters or fewer (after stripping) are discarded.
        """
        text = _WHITESPACE_RE.sub(' ', text).strip()
        if not text:
            return []
        sentences = _SENTENCE_RE.split(text)
        return [s.strip() for s in sentences if s and len(s.strip()) > 20]

    def _find_semantic_boundaries(self, sentences: List[str]) -> List[int]:
        """Return sorted sentence indices at which a new semantic group starts.

        Index 0 is always included on success. An index ``i + 1`` is added when
        the cosine similarity between sentences ``i`` and ``i + 1`` is below
        ``similarity_threshold``, or when it is a local dip of more than 0.1
        relative to both neighbouring similarities.

        Best-effort: if embedding or the similarity computation fails, a
        warning is logged and ``[]`` is returned so that grouping falls back
        to token limits only.
        """
        if len(sentences) < 2:
            return []

        try:
            # return_sparse=False — we only need dense vectors for cosine similarity
            embeddings = self.embedder.embed_batch(sentences, return_sparse=False)
            dense_vecs = np.array([e.dense for e in embeddings])
            norms = np.linalg.norm(dense_vecs, axis=1, keepdims=True)
            dense_vecs = dense_vecs / (norms + 1e-9)  # epsilon guards zero vectors

            # Vectorized consecutive cosine similarity
            similarities = np.einsum('ij,ij->i', dense_vecs[:-1], dense_vecs[1:])

            boundaries = [0]
            last = len(similarities) - 1
            for i, sim in enumerate(similarities):
                if sim < self.similarity_threshold:
                    boundaries.append(i + 1)
                elif 0 < i < last:
                    if sim < similarities[i - 1] - 0.1 and sim < similarities[i + 1] - 0.1:
                        boundaries.append(i + 1)

            return sorted(set(boundaries))
        except Exception:
            import logging
            logging.getLogger(__name__).warning(
                "Semantic boundary detection failed; falling back to size-based grouping",
                exc_info=True,
            )
            return []

    def _group_sentences(self, sentences: List[str], boundaries: List[int]) -> List[List[str]]:
        """Group consecutive sentences into chunks.

        A group is flushed when a boundary index is reached or adding the next
        sentence would exceed ``max_chunk_tokens`` — but only if the group
        already holds at least ``min_chunk_tokens``; otherwise it keeps growing.
        """
        if not sentences:
            return []

        # Set membership keeps the per-sentence boundary test O(1).
        boundary_set = set(boundaries) if boundaries else {0}

        groups = []
        current_group = []
        current_tokens = 0

        for i, sentence in enumerate(sentences):
            sent_tokens = len(sentence) // 4  # rough characters-per-token estimate

            wants_split = i in boundary_set or current_tokens + sent_tokens > self.max_chunk_tokens
            if wants_split and current_group and current_tokens >= self.min_chunk_tokens:
                groups.append(current_group)
                current_group = []
                current_tokens = 0

            current_group.append(sentence)
            current_tokens += sent_tokens

        if current_group:
            # Merge small trailing group into previous to avoid tiny chunks
            if groups and current_tokens < self.min_chunk_tokens:
                groups[-1].extend(current_group)
            else:
                groups.append(current_group)

        return groups

    def _build_context(self, section: Dict, doc_title: str) -> str:
        """Return a ``" > "``-joined breadcrumb for a section.

        The breadcrumb is ``"Document: <title>"`` (title cut to 100 characters)
        followed by up to the two nearest parent headings (each cut to 80
        characters). Returns ``""`` when neither is available.
        """
        parts = []
        if doc_title:
            parts.append(f"Document: {doc_title[:100]}")
        if section.get("parent_titles"):
            for pt in section["parent_titles"][-2:]:
                parts.append(pt[:80])
        return " > ".join(parts) if parts else ""
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def chunk_document(
    pages: List[Dict],
    doc_id: str,
    doc_type: str,
    title: str = "",
    max_chunk_tokens: int = 512,
    embedder=None,
) -> List[Chunk]:
    """Chunk one document with a throwaway ``SemanticChunker``.

    Convenience wrapper: builds a chunker with the given token limit and
    embedder (all other settings at their defaults) and returns its
    ``chunk_document`` result for the supplied pages.
    """
    return SemanticChunker(
        max_chunk_tokens=max_chunk_tokens,
        embedder=embedder,
    ).chunk_document(pages, doc_id, doc_type, title)
|
graphrag_v4/corpus_builder.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Corpus Builder (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- Enum serialization fix in write_jsonl
|
| 6 |
+
- doc_ids tracked on entities for cross-document linking
|
| 7 |
+
- Larger batch sizes for GPU utilization
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import json
|
| 12 |
+
import re
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from enum import Enum
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Dict, List, Optional, Tuple
|
| 17 |
+
from collections import defaultdict
|
| 18 |
+
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
|
| 21 |
+
from .models import Chunk, Entity, Relation, DocType, generate_id, detect_doc_type
|
| 22 |
+
from .chunking import SemanticChunker
|
| 23 |
+
from .extraction import EntityExtractor, RelationExtractor, CorpusDeduplicator
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_pdf(pdf_path: Path) -> Tuple[List[Dict], Dict]:
    """Extract per-page text and basic metadata from a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``(pages, metadata)``. ``pages`` is a list of
        ``{"page_num": <1-based int>, "text": <str>}`` dicts with runs of
        spaces/tabs/NBSP collapsed and 3+ newlines reduced to two.
        ``metadata`` has ``page_count``, ``title`` and ``author``.
        Returns ``([], {})`` when PyMuPDF is not installed or the file cannot
        be opened; a warning is logged in both cases.
    """
    import logging

    log = logging.getLogger(__name__)

    try:
        import fitz
    except ImportError:
        log.warning("PyMuPDF (fitz) is not installed; cannot read %s", pdf_path)
        return [], {}

    try:
        doc = fitz.open(pdf_path)
    except Exception:
        # Best-effort: an unreadable file is reported as "no pages".
        log.warning("Could not open PDF %s", pdf_path, exc_info=True)
        return [], {}

    try:
        pages = []
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            text = re.sub(r'[ \t\u00A0]+', ' ', text)
            text = re.sub(r'\n{3,}', '\n\n', text)
            pages.append({"page_num": page_num + 1, "text": text.strip()})

        # The metadata dict, or individual values in it, may be None.
        raw_meta = doc.metadata or {}
        metadata = {
            "page_count": len(doc),
            "title": raw_meta.get("title") or "",
            "author": raw_meta.get("author") or "",
        }
    finally:
        # Release the file handle even if a page fails to extract.
        doc.close()

    return pages, metadata
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def guess_title(pages: List[Dict]) -> str:
    """Pick a plausible title from the top of the first page.

    Looks at the first 10 lines of ``pages[0]["text"]`` and returns the first
    stripped line that is longer than 15 and shorter than 200 characters and
    does not consist solely of digits, whitespace, dots and slashes.
    Returns ``""`` when there are no pages or no line qualifies.
    """
    if not pages:
        return ""

    leading_lines = pages[0]["text"].split('\n')[:10]
    candidates = (raw.strip() for raw in leading_lines)
    return next(
        (
            candidate
            for candidate in candidates
            if 15 < len(candidate) < 200
            and re.match(r'^[\d\s./]+$', candidate) is None
        ),
        "",
    )
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class ModelManager:
    """Owns the embedder, chunker and extractors and creates them on demand.

    Nothing is loaded in the constructor. The first call to ``initialize()``,
    or the first access to any of the properties, builds all four components
    at once; later calls are no-ops.
    """

    def __init__(self, max_chunk_tokens: int = 384):
        self.max_chunk_tokens = max_chunk_tokens
        self._initialized = False
        self._embedder = None
        self._chunker = None
        self._entity_extractor = None
        self._relation_extractor = None

    def initialize(self):
        """Create every component once; do nothing if already initialized."""
        if not self._initialized:
            from .embeddings import get_embedder

            self._embedder = get_embedder()
            # The chunker reuses the same embedder instance.
            self._chunker = SemanticChunker(
                max_chunk_tokens=self.max_chunk_tokens,
                embedder=self._embedder,
            )
            self._entity_extractor = EntityExtractor()
            self._relation_extractor = RelationExtractor()
            self._initialized = True

    @property
    def embedder(self):
        """The shared embedder (triggers initialization if needed)."""
        self.initialize()
        return self._embedder

    @property
    def chunker(self) -> SemanticChunker:
        """The semantic chunker (triggers initialization if needed)."""
        self.initialize()
        return self._chunker

    @property
    def entity_extractor(self) -> EntityExtractor:
        """The entity extractor (triggers initialization if needed)."""
        self.initialize()
        return self._entity_extractor

    @property
    def relation_extractor(self) -> RelationExtractor:
        """The relation extractor (triggers initialization if needed)."""
        self.initialize()
        return self._relation_extractor
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def process_document(file_path: Path, model_manager: ModelManager) -> Optional[Dict]:
    """Run the full per-document pipeline: PDF text, chunks, entities, relations.

    Args:
        file_path: PDF to process.
        model_manager: Supplies the chunker and the entity/relation extractors.

    Returns:
        A dict with ``doc_info``, ``chunks``, ``entities``, ``relations`` and
        ``chunk_entities`` (chunk id -> list of entity ids), or ``None`` when
        the PDF yields no pages, no text, or no chunks.
    """
    pages, metadata = extract_pdf(file_path)
    if not pages:
        return None

    combined_text = "".join(page.get("text", "") for page in pages)
    if not combined_text.strip():
        return None

    filename = file_path.name
    # Embedded PDF title wins; otherwise guess from the first page.
    title = metadata.get("title") or guess_title(pages)
    doc_type = detect_doc_type(filename, pages[0]["text"])
    doc_id = generate_id("doc", filename, len(pages))

    chunks = model_manager.chunker.chunk_document(pages, doc_id, doc_type.value, title)
    if not chunks:
        return None

    entities = model_manager.entity_extractor.extract_batch(
        [piece.text for piece in chunks],
        [piece.chunk_id for piece in chunks],
        batch_size=64,
    )

    # Tag each entity with its document and index entity ids by source chunk.
    entity_ids_by_chunk = defaultdict(list)
    for found in entities:
        found.doc_ids = [doc_id]
        entity_ids_by_chunk[found.source_id].append(found.entity_id)

    entities_by_id = {found.entity_id: found for found in entities}

    relation_extractor = model_manager.relation_extractor
    relations = []
    for piece in chunks:
        # .get() keeps the defaultdict from growing empty entries.
        linked = [
            entities_by_id[eid]
            for eid in entity_ids_by_chunk.get(piece.chunk_id, [])
            if eid in entities_by_id
        ]
        relations += relation_extractor.extract(piece.text, linked, piece.chunk_id)

    doc_info = {
        "doc_id": doc_id,
        "filename": filename,
        "doc_type": doc_type.value,
        "title": title,
        "page_count": len(pages),
        "processed_at": datetime.now().isoformat(),
    }
    return {
        "doc_info": doc_info,
        "chunks": chunks,
        "entities": entities,
        "relations": relations,
        "chunk_entities": dict(entity_ids_by_chunk),
    }
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _json_default(value):
    """Fallback converter for values the ``json`` encoder cannot handle natively."""
    if isinstance(value, Enum):
        return value.value
    if hasattr(value, 'tolist'):
        # Array-like objects exposing tolist() become plain lists/scalars.
        return value.tolist()
    return str(value)


def write_jsonl(filepath: Path, items: List):
    """Write ``items`` to ``filepath`` as UTF-8 JSON Lines, one item per line.

    Items that have a ``__dict__`` are serialized from a shallow copy of their
    attributes; anything else is passed to ``json.dumps`` as-is. Enum members
    are written as their ``.value`` and objects with ``tolist()`` as lists,
    at any nesting depth (not only as top-level attributes). Any other
    unsupported value is written as ``str(value)``.

    Args:
        filepath: Destination file; overwritten if it exists.
        items: Objects or JSON-compatible values to serialize.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for item in items:
            data = dict(item.__dict__) if hasattr(item, '__dict__') else item
            f.write(json.dumps(data, ensure_ascii=False, default=_json_default) + "\n")
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def build_corpus(input_path: Path, output_dir: Path, max_chunk_tokens: int = 384):
    """Build the on-disk corpus from one PDF or a directory of PDFs.

    Args:
        input_path: A single file, or a directory whose ``*.pdf`` / ``*.PDF``
            entries are processed (non-recursive).
        output_dir: Destination directory; created if it does not exist.
        max_chunk_tokens: Forwarded to ``ModelManager``.

    Writes ``documents.jsonl``, ``chunks.jsonl``, ``entities.jsonl``,
    ``relations.jsonl``, ``chunk_entities.json`` and ``stats.json`` into
    ``output_dir``.  When no input file is found, only ``output_dir`` is
    created and the function returns early.  Documents that fail to process
    are logged and skipped; they do not abort the build.
    """
    # Function-scope import so this block does not depend on the module's
    # import section being edited.
    import logging

    log = logging.getLogger(__name__)
    output_dir.mkdir(parents=True, exist_ok=True)

    if input_path.is_file():
        files = [input_path]
    else:
        # On case-insensitive filesystems (e.g. Windows) both globs can match
        # the same files; dict.fromkeys drops repeats and keeps the order.
        matches = sorted(input_path.glob("*.pdf")) + sorted(input_path.glob("*.PDF"))
        files = list(dict.fromkeys(matches))

    if not files:
        log.warning("No PDF files found at %s; nothing to build", input_path)
        return

    model_manager = ModelManager(max_chunk_tokens=max_chunk_tokens)
    model_manager.initialize()

    all_docs = []
    all_chunks = []
    all_entities = []
    all_relations = []
    all_chunk_entities = {}

    for file_path in tqdm(files, desc="Processing"):
        try:
            result = process_document(file_path, model_manager)
            if result:
                all_docs.append(result["doc_info"])
                all_chunks.extend(result["chunks"])
                all_entities.extend(result["entities"])
                all_relations.extend(result["relations"])
                all_chunk_entities.update(result["chunk_entities"])
        except Exception:
            # Best-effort per document, but record the failure (with
            # traceback) instead of discarding it silently.
            log.exception("Failed to process %s; skipping", file_path)

    if all_entities:
        deduplicator = CorpusDeduplicator(embedder=model_manager.embedder)
        all_entities, id_mapping = deduplicator.deduplicate_entities(all_entities)

        # Rewrite chunk -> entity ids to canonical ids.  dict.fromkeys removes
        # duplicates while keeping a stable, reproducible order.
        for chunk_id in all_chunk_entities:
            all_chunk_entities[chunk_id] = list(dict.fromkeys(
                id_mapping.get(eid, eid) for eid in all_chunk_entities[chunk_id]
            ))

        all_relations = deduplicator.deduplicate_relations(all_relations, id_mapping)

    write_jsonl(output_dir / "documents.jsonl", all_docs)
    write_jsonl(output_dir / "chunks.jsonl", all_chunks)
    write_jsonl(output_dir / "entities.jsonl", all_entities)
    write_jsonl(output_dir / "relations.jsonl", all_relations)

    with open(output_dir / "chunk_entities.json", "w", encoding="utf-8") as f:
        json.dump(all_chunk_entities, f, ensure_ascii=False)

    with open(output_dir / "stats.json", "w", encoding="utf-8") as f:
        json.dump({
            "processed_at": datetime.now().isoformat(),
            "document_count": len(all_docs),
            "total_chunks": len(all_chunks),
            "total_entities": len(all_entities),
            "total_relations": len(all_relations),
        }, f, ensure_ascii=False, indent=2)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def main():
    """Command-line entry point: parse the arguments and run ``build_corpus``."""
    cli = argparse.ArgumentParser(description="Build GraphRAG corpus from PDF documents")
    cli.add_argument("--input", "-i", required=True, help="Input file or directory")
    cli.add_argument("--output", "-o", required=True, help="Output corpus directory")
    cli.add_argument("--max-chunk-tokens", type=int, default=384, help="Max chunk size")
    options = cli.parse_args()

    source = Path(options.input)
    destination = Path(options.output)
    build_corpus(
        input_path=source,
        output_dir=destination,
        max_chunk_tokens=options.max_chunk_tokens,
    )


if __name__ == "__main__":
    main()
|
graphrag_v4/embeddings.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - BGE-M3 Embedding Service (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- Larger default batch_size (64) to fill GPU better
|
| 6 |
+
- embed_batch converts all dense vectors to a single float32 numpy array (no per-vector loop); it does not re-normalize them
|
| 7 |
+
- LRU cache for repeated query embeddings during interactive sessions (not wired up yet: lru_cache is imported but unused in this module)
|
| 8 |
+
- Reduced max_length to 2048 for chunks (saves VRAM, chunks are <512 tokens)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
from typing import Dict, List, Optional, Tuple
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from functools import lru_cache
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class EmbeddingOutput:
    """Embedding of a single text: a dense vector plus optional sparse weights."""

    # Dense embedding vector.
    dense: np.ndarray
    # Sparse weights keyed by int, or None when no sparse output was produced.
    sparse: Optional[Dict[int, float]] = None

    def to_dict(self) -> dict:
        """Return a plain-Python view: the dense vector as a list, sparse as stored."""
        dense_as_list = self.dense.tolist()
        return {"dense": dense_as_list, "sparse": self.sparse}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class BGEM3Embedder:
    """Lazy wrapper around the ``BAAI/bge-m3`` model from FlagEmbedding.

    The model is only instantiated on first use (``_load``), so constructing
    an embedder is cheap.
    """

    MODEL_NAME = "BAAI/bge-m3"
    DENSE_DIM = 1024

    def __init__(self, device: str = "auto", max_length: int = 2048, use_fp16: bool = True):
        """Store the configuration; no model is loaded here."""
        self._device = device
        self._loaded = False
        self.model = None
        self.max_length = max_length
        self.use_fp16 = use_fp16

    def _load(self):
        """Instantiate the model once; subsequent calls do nothing."""
        if not self._loaded:
            import torch
            from FlagEmbedding import BGEM3FlagModel

            # "auto" resolves to CUDA when available, otherwise CPU.
            if self._device == "auto":
                self._device = "cuda" if torch.cuda.is_available() else "cpu"

            on_gpu = self._device == "cuda"
            self.model = BGEM3FlagModel(
                self.MODEL_NAME,
                use_fp16=self.use_fp16 and on_gpu,  # fp16 is only requested on CUDA
                device=self._device,
            )
            self._loaded = True

    @property
    def device(self) -> str:
        """Resolved device name; reading it triggers model loading."""
        self._load()
        return self._device

    def embed(self, text: str, return_sparse: bool = True) -> EmbeddingOutput:
        """Embed one text by delegating to ``embed_batch``."""
        return self.embed_batch([text], return_sparse=return_sparse)[0]

    def embed_batch(
        self,
        texts: List[str],
        return_sparse: bool = True,
        batch_size: int = 64,
    ) -> List[EmbeddingOutput]:
        """Embed ``texts`` and return one ``EmbeddingOutput`` per input, in order.

        Returns an empty list for empty input (the model is still loaded).
        ``sparse`` is ``None`` unless ``return_sparse`` is set and the model
        output contains ``"lexical_weights"``.
        """
        self._load()

        if not texts:
            return []

        encoded = self.model.encode(
            texts,
            batch_size=batch_size,
            max_length=self.max_length,
            return_dense=True,
            return_sparse=return_sparse,
            return_colbert_vecs=False,
        )

        # One float32 matrix for the whole batch.  NOTE(review): vectors are
        # used as returned by the model; nothing is normalized here.
        dense_matrix = np.array(encoded["dense_vecs"], dtype=np.float32)
        sparse_available = return_sparse and "lexical_weights" in encoded

        outputs = []
        for row, _ in enumerate(texts):
            weights = encoded["lexical_weights"][row] if sparse_available else None
            outputs.append(EmbeddingOutput(dense=dense_matrix[row], sparse=weights))
        return outputs

    def compute_similarity(
        self,
        query_emb: EmbeddingOutput,
        doc_emb: EmbeddingOutput,
        weights: Tuple[float, float] = (0.6, 0.4),
    ) -> Tuple[float, Dict[str, float]]:
        """Score a query/document pair as a weighted dense + sparse similarity.

        Returns ``(total, scores)`` where ``scores`` has the keys ``"dense"``
        (cosine similarity) and ``"sparse"`` (sum of weight products over the
        keys present in both sparse dicts, 0.0 when either side has none).
        """
        dense_weight, sparse_weight = weights

        # Cosine similarity; the epsilon guards against zero-length vectors.
        query_unit = query_emb.dense / (np.linalg.norm(query_emb.dense) + 1e-9)
        doc_unit = doc_emb.dense / (np.linalg.norm(doc_emb.dense) + 1e-9)
        dense_score = float(np.dot(query_unit, doc_unit))

        sparse_score = 0.0
        if query_emb.sparse and doc_emb.sparse:
            shared = set(query_emb.sparse.keys()) & set(doc_emb.sparse.keys())
            if shared:
                sparse_score = sum(query_emb.sparse[k] * doc_emb.sparse[k] for k in shared)

        scores = {"dense": dense_score, "sparse": sparse_score}
        total = dense_weight * scores["dense"] + sparse_weight * scores["sparse"]
        return total, scores
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Module-wide embedder instance, created on the first get_embedder() call.
_embedder: Optional[BGEM3Embedder] = None


def get_embedder() -> BGEM3Embedder:
    """Return the shared ``BGEM3Embedder``, constructing it on first use."""
    global _embedder
    if _embedder is not None:
        return _embedder
    _embedder = BGEM3Embedder()
    return _embedder
|
graphrag_v4/extraction.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Entity & Relation Extraction (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- Pre-compiled all regex patterns at module load time
|
| 6 |
+
- Entity merge uses sorted intervals instead of O(n*span) set-per-char
|
| 7 |
+
- Dedup clustering: upper-triangle only with np.triu_indices (skip O(n²) python loop)
|
| 8 |
+
- Proximity extraction: position-sorted forward scan with early break instead of a full nested loop
|
| 9 |
+
- Batch GLiNER: larger default batch_size=64
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
from typing import List, Dict, Tuple
|
| 14 |
+
from collections import defaultdict
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
from .models import Entity, Relation, generate_id, normalize_text
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Label set handed to GLiNER when the caller does not supply its own.
ENTITY_LABELS = [
    "ORGANIZATION",
    "PERSON",
    "LAW_REFERENCE",
    "DECREE",
    "DATE",
    "SPECIALITY",
    "EDU_LEVEL",
    "DOCUMENT",
    "NUMBER",
]

# Flags shared by every pattern below.
_RE_FLAGS = re.UNICODE | re.IGNORECASE

# Pre-compiled entity patterns, keyed by entity type.
_ENTITY_PATTERNS = {
    "LAW_REFERENCE": [
        re.compile(r'Закон(?:у|ом|і)?\s+України\s+[«"]([^»"]+)[»"]', _RE_FLAGS),
    ],
    "SPECIALITY": [
        re.compile(r"(?:спеціальніст[ьюі]\s+)?(\d{3})\s*[«\"]?([А-ЯІЇЄа-яіїє][А-ЯІЇЄа-яіїє\s-]{3,40})[»\"]?", _RE_FLAGS),
    ],
    "DATE": [
        re.compile(r"\d{1,2}\s+(?:січня|лютого|березня|квітня|травня|червня|липня|серпня|вересня|жовтня|листопада|грудня)\s+\d{4}", _RE_FLAGS),
    ],
}

# Pre-compiled (pattern, relation_type) pairs.
_RELATION_PATTERNS = [
    (re.compile(r"(?:вимаг[ає]+ться|потребує|необхідно)\s+(.{10,100})", _RE_FLAGS), "requires"),
    (re.compile(r"(?:регулюється|визначається)\s+(.{10,100})", _RE_FLAGS), "regulated_by"),
    (re.compile(r"(?:відповідно до|згідно з)\s+(.{10,100})", _RE_FLAGS), "according_to"),
    (re.compile(r"(.{10,50})\s+(?:включає|містить)\s+(.{10,100})", _RE_FLAGS), "includes"),
]

# Default minimum confidence used by EntityExtractor.
MIN_ENTITY_CONFIDENCE = 0.4
# Maximum character distance between two entity positions for a proximity relation.
PROXIMITY_THRESHOLD = 200
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class GLiNERExtractor:
    """GLiNER entity extractor that shares one loaded model across instances."""

    # Class-level cache: the loaded model and the name it was loaded from.
    _model = None
    _model_name = None

    def __init__(self, model_name: str = "urchade/gliner_multi-v2.1"):
        self.model_name = model_name

    def _load(self):
        """Return the cached model, loading it when missing or loaded under another name."""
        cache = GLiNERExtractor
        cached_model = cache._model
        if cached_model is not None and cache._model_name == self.model_name:
            return cached_model

        from gliner import GLiNER
        import torch

        loaded = GLiNER.from_pretrained(self.model_name)
        if torch.cuda.is_available():
            loaded = loaded.to("cuda")

        cache._model = loaded
        cache._model_name = self.model_name
        return loaded

    def extract_batch(
        self,
        texts: List[str],
        labels: List[str] = None,
        threshold: float = 0.5,
        batch_size: int = 64,
    ) -> List[List[Dict]]:
        """Run the model over ``texts`` in slices of ``batch_size``.

        Returns one list per input text; each element is a dict with the keys
        ``type``, ``value``, ``start``, ``end`` and ``confidence`` (0.8 when
        the model result carries no ``score``).  ``labels`` falls back to
        ``ENTITY_LABELS`` when empty or ``None``.
        """
        model = self._load()
        labels = labels or ENTITY_LABELS

        per_text = []
        for offset in range(0, len(texts), batch_size):
            window = texts[offset:offset + batch_size]
            predictions = model.batch_predict_entities(window, labels, threshold=threshold)
            for raw_spans in predictions:
                records = []
                for span in raw_spans:
                    records.append({
                        "type": span["label"],
                        "value": span["text"],
                        "start": span["start"],
                        "end": span["end"],
                        "confidence": span.get("score", 0.8),
                    })
                per_text.append(records)

        return per_text
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class PatternExtractor:
    """Minimal pattern extractor — only catches what GLiNER misses."""

    def extract(self, text: str) -> List[Dict]:
        """Return regex matches from ``_ENTITY_PATTERNS`` as entity dicts.

        Each dict has the keys ``type``, ``value``, ``start``, ``end`` and a
        fixed ``confidence`` of 0.85.  Only the first match per
        ``(type, normalize_text(value))`` pair is kept.
        """
        found = []
        emitted = set()

        for label, compiled_patterns in _ENTITY_PATTERNS.items():
            for regex in compiled_patterns:
                for hit in regex.finditer(text):
                    surface = hit.group(0).strip()
                    dedup_key = (label, normalize_text(surface))
                    if dedup_key in emitted:
                        continue
                    emitted.add(dedup_key)
                    found.append({
                        "type": label,
                        "value": surface,
                        "start": hit.start(),
                        "end": hit.end(),
                        "confidence": 0.85,
                    })

        return found
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class EntityExtractor:
    """Combine GLiNER and regex-pattern extraction into ``Entity`` objects."""

    def __init__(self, min_confidence: float = MIN_ENTITY_CONFIDENCE):
        self._gliner = GLiNERExtractor()
        self._pattern = PatternExtractor()
        self.min_confidence = min_confidence
        # Load the GLiNER model at construction time rather than on first use.
        self._gliner._load()

    def extract_batch(self, texts: List[str], source_ids: List[str], batch_size: int = 64) -> List[Entity]:
        """Extract entities from ``texts``; ``source_ids[i]`` tags entities of ``texts[i]``.

        GLiNER results take precedence: a pattern match is only added when
        its span overlaps no GLiNER span.  Entities below
        ``self.min_confidence`` are dropped.
        """
        if not texts:
            return []

        gliner_results = self._gliner.extract_batch(texts, batch_size=batch_size)
        all_entities = []

        for text, source_id, gliner_ents in zip(texts, source_ids, gliner_results):
            pattern_ents = self._pattern.extract(text)
            merged = self._merge_entities(gliner_ents, pattern_ents)
            merged = [e for e in merged if e.get("confidence", 0) >= self.min_confidence]
            all_entities.extend(self._to_entity_objects(merged, source_id))

        return all_entities

    def _to_entity_objects(self, raw_entities: List[Dict], source_id: str) -> List[Entity]:
        """Convert raw entity dicts to ``Entity`` objects.

        Skips values whose normalized form is shorter than 2 characters and
        keeps only the first entry per ``(type, normalized)`` pair.
        """
        entities = []
        seen = set()

        for ent in raw_entities:
            normalized = normalize_text(ent["value"])
            if len(normalized) < 2:
                continue
            key = (ent["type"], normalized)
            if key in seen:
                continue
            seen.add(key)

            entities.append(Entity(
                entity_id=generate_id("ent", ent["type"], ent["value"]),
                entity_type=ent["type"],
                value=ent["value"],
                normalized=normalized,
                source_id=source_id,
                confidence=ent.get("confidence", 0.8),
            ))

        return entities

    def _merge_entities(self, primary: List[Dict], secondary: List[Dict]) -> List[Dict]:
        """Return ``primary`` plus every ``secondary`` entity overlapping no primary span.

        Spans are read from the ``start`` / ``end`` keys (default 0).  The
        primary spans are sorted and coalesced into disjoint intervals first.
        The binary search in ``_overlaps_any`` is only valid on disjoint
        intervals, so without this step nested or overlapping primary spans
        could hide a real overlap.
        """
        if not primary:
            return secondary
        if not secondary:
            return primary

        intervals = self._coalesce_intervals(
            sorted((ent.get("start", 0), ent.get("end", 0)) for ent in primary)
        )

        merged = list(primary)
        for ent in secondary:
            s, e = ent.get("start", 0), ent.get("end", 0)
            if not self._overlaps_any(intervals, s, e):
                merged.append(ent)

        return merged

    @staticmethod
    def _coalesce_intervals(sorted_intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
        """Merge strictly overlapping intervals of a start-sorted list into disjoint ones."""
        disjoint: List[Tuple[int, int]] = []
        for start, end in sorted_intervals:
            # Strict "<": intervals that merely touch stay separate, which
            # keeps results identical for input that was already disjoint.
            if disjoint and start < disjoint[-1][1]:
                prev_start, prev_end = disjoint[-1]
                disjoint[-1] = (prev_start, max(prev_end, end))
            else:
                disjoint.append((start, end))
        return disjoint

    @staticmethod
    def _overlaps_any(sorted_intervals: List[Tuple[int, int]], start: int, end: int) -> bool:
        """Binary search to check if [start, end) overlaps any interval.

        Precondition: ``sorted_intervals`` is sorted and pairwise disjoint
        (each interval starts at or after the previous one ends).
        """
        lo, hi = 0, len(sorted_intervals) - 1
        while lo <= hi:
            mid = (lo + hi) // 2
            iv_start, iv_end = sorted_intervals[mid]
            if iv_end <= start:
                lo = mid + 1  # interval lies entirely before the query
            elif iv_start >= end:
                hi = mid - 1  # interval lies entirely after the query
            else:
                return True  # overlap
        return False
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
class RelationExtractor:
    """Relation extraction with proximity scoring and source attribution."""

    def extract(self, text: str, entities: List[Entity], chunk_id: str = "") -> List[Relation]:
        """Return pattern-based relations followed by proximity-based relations."""
        relations = []
        relations.extend(self._extract_patterns(text, entities, chunk_id))
        relations.extend(self._extract_proximity(text, entities, chunk_id))
        return relations

    def _extract_patterns(self, text: str, entities: List[Entity], chunk_id: str) -> List[Relation]:
        """Create a relation for each pattern match whose text mentions >= 2 entities.

        An entity counts as mentioned when its ``normalized`` string occurs in
        the lower-cased match.  The first two mentioned entities (in index
        order) become source and target.
        """
        if not entities:
            return []

        entity_index = {ent.normalized: ent for ent in entities}
        relations = []
        seen = set()

        for pattern, rel_type in _RELATION_PATTERNS:
            for match in pattern.finditer(text):
                evidence = match.group(0)
                evidence_lower = evidence.lower()

                # De-duplicate by entity_id instead of hashing Entity objects:
                # this keeps first-seen order and does not require Entity to
                # be hashable (an eq-dataclass would not be — TODO confirm
                # against the Entity definition).
                by_id = {}
                for key, ent in entity_index.items():
                    if key in evidence_lower:
                        by_id.setdefault(ent.entity_id, ent)
                mentioned = list(by_id.values())

                if len(mentioned) < 2:
                    continue

                source_ent, target_ent = mentioned[0], mentioned[1]
                rel_key = (source_ent.entity_id, target_ent.entity_id, rel_type)
                if rel_key in seen:
                    continue
                seen.add(rel_key)

                relations.append(Relation(
                    relation_id=generate_id("rel", chunk_id, *rel_key),
                    source_entity_id=source_ent.entity_id,
                    target_entity_id=target_ent.entity_id,
                    relation_type=rel_type,
                    text_evidence=evidence[:200],
                    confidence=0.9,
                    source="pattern",
                ))

        return relations

    def _extract_proximity(self, text: str, entities: List[Entity], chunk_id: str) -> List[Relation]:
        """Link entities of different types that first occur close together.

        Entities are placed at the first occurrence of ``normalized`` in the
        lower-cased text and sorted by position.  Each entity is compared
        only with later ones, and the inner scan stops once the distance
        exceeds ``PROXIMITY_THRESHOLD``; the worst case is still quadratic
        when many entities fall inside one window.  Confidence decreases
        linearly with distance and never drops below 0.3.
        """
        if len(entities) < 2:
            return []

        text_lower = text.lower()
        entity_positions = []
        for ent in entities:
            pos = text_lower.find(ent.normalized)
            if pos >= 0:
                entity_positions.append((ent, pos))

        if len(entity_positions) < 2:
            return []

        entity_positions.sort(key=lambda x: x[1])

        relations = []
        seen = set()
        total = len(entity_positions)

        for i, (ent_a, pos_a) in enumerate(entity_positions):
            for j in range(i + 1, total):
                ent_b, pos_b = entity_positions[j]

                distance = pos_b - pos_a
                if distance > PROXIMITY_THRESHOLD:
                    # Positions are sorted, so every later entity is farther away.
                    break

                if ent_a.entity_type == ent_b.entity_type:
                    continue

                rel_key = (ent_a.entity_id, ent_b.entity_id)
                if rel_key in seen:
                    continue
                seen.add(rel_key)

                confidence = max(0.3, 1.0 - (distance / PROXIMITY_THRESHOLD) * 0.6)
                # Evidence: the span covering both entities plus 20 characters
                # of context on each side, capped at 200 characters below.
                context_start = max(0, pos_a - 20)
                context_end = min(len(text), pos_b + len(ent_b.normalized) + 20)
                evidence = text[context_start:context_end]

                relations.append(Relation(
                    relation_id=generate_id("prox", chunk_id, ent_a.entity_id, ent_b.entity_id),
                    source_entity_id=ent_a.entity_id,
                    target_entity_id=ent_b.entity_id,
                    relation_type="related_to",
                    text_evidence=evidence[:200],
                    confidence=confidence,
                    source="proximity",
                ))

        return relations
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
class EntityDeduplicator:
    """Merge near-duplicate entities of the same type via embedding similarity."""

    def __init__(self, similarity_threshold: float = 0.85, embedder=None):
        self.similarity_threshold = similarity_threshold
        self._embedder = embedder

    @property
    def embedder(self):
        """Embedder used for clustering; falls back to the shared one lazily."""
        if self._embedder is None:
            from .embeddings import get_embedder
            self._embedder = get_embedder()
        return self._embedder

    def deduplicate(self, entities: List[Entity]) -> Tuple[List[Entity], Dict[str, str]]:
        """Return ``(canonical_entities, id_mapping)`` for ``entities``.

        Entities are grouped by ``entity_type`` and clustered within each
        group.  ``id_mapping`` maps every input ``entity_id`` to the id of
        its cluster's canonical entity.  If clustering a group fails, the
        failure is logged and that group is kept unchanged (identity
        mapping); no partial result from the failed attempt is kept.
        """
        if not entities:
            return [], {}

        by_type = defaultdict(list)
        for ent in entities:
            by_type[ent.entity_type].append(ent)

        canonical_entities = []
        id_mapping = {}

        for entity_type, type_entities in by_type.items():
            if len(type_entities) == 1:
                only = type_entities[0]
                canonical_entities.append(only)
                id_mapping[only.entity_id] = only.entity_id
                continue

            try:
                kept, mapping = self._deduplicate_group(type_entities)
            except Exception:
                # Best-effort fallback, but make the failure visible.
                import logging

                logging.getLogger(__name__).exception(
                    "Entity deduplication failed for type %r; keeping %d entities as-is",
                    entity_type, len(type_entities),
                )
                kept = list(type_entities)
                mapping = {ent.entity_id: ent.entity_id for ent in type_entities}

            canonical_entities.extend(kept)
            id_mapping.update(mapping)

        return canonical_entities, id_mapping

    def _deduplicate_group(self, type_entities: List[Entity]) -> Tuple[List[Entity], Dict[str, str]]:
        """Cluster one same-type group; return its canonical entities and id mapping.

        The canonical entity of a cluster is the one with the highest
        ``(confidence, len(value))``; it receives the union of the cluster's
        ``doc_ids`` in first-seen order.  Entities are only mutated after
        every cluster has been computed, so a failure leaves them untouched.
        """
        texts = [ent.value for ent in type_entities]
        embeddings = self.embedder.embed_batch(texts, return_sparse=False)
        vectors = np.array([e.dense for e in embeddings])
        # Unit-normalize so the dot products below are cosine similarities.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        vectors = vectors / (norms + 1e-9)

        planned = []
        mapping = {}
        for cluster in self._cluster_union_find(vectors):
            members = [type_entities[i] for i in cluster]
            canonical = max(members, key=lambda e: (e.confidence, len(e.value)))
            # dict.fromkeys: de-duplicate with a deterministic order.
            doc_ids = list(dict.fromkeys(d for ent in members for d in ent.doc_ids))
            planned.append((canonical, doc_ids))
            for ent in members:
                mapping[ent.entity_id] = canonical.entity_id

        for canonical, doc_ids in planned:
            canonical.doc_ids = doc_ids

        return [canonical for canonical, _ in planned], mapping

    def _cluster_union_find(self, vectors: np.ndarray) -> List[List[int]]:
        """Vectorized union-find: compute similarity matrix once, extract pairs via np.triu_indices.

        Rows of ``vectors`` whose dot product is at least
        ``self.similarity_threshold`` are joined, transitively.  Returns
        lists of row indices, one list per cluster.
        """
        n = len(vectors)
        if n == 0:
            return []

        parent = list(range(n))

        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving (iterative, avoids recursion limit)
                x = parent[x]
            return x

        def union(x, y):
            px, py = find(x), find(y)
            if px != py:
                parent[px] = py

        sim_matrix = vectors @ vectors.T
        # Only the upper triangle (i < j) is needed: similarity is symmetric.
        rows, cols = np.triu_indices(n, k=1)
        mask = sim_matrix[rows, cols] >= self.similarity_threshold
        for i, j in zip(rows[mask], cols[mask]):
            union(int(i), int(j))

        clusters = defaultdict(list)
        for i in range(n):
            clusters[find(i)].append(i)

        return list(clusters.values())
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
class CorpusDeduplicator:
    """Corpus-wide entity and relation deduplication."""

    def __init__(self, similarity_threshold: float = 0.85, embedder=None):
        self.entity_dedup = EntityDeduplicator(similarity_threshold, embedder)

    def deduplicate_entities(self, all_entities: List[Entity]) -> Tuple[List[Entity], Dict[str, str]]:
        """Delegate to ``EntityDeduplicator.deduplicate``."""
        return self.entity_dedup.deduplicate(all_entities)

    def deduplicate_relations(
        self,
        relations: List[Relation],
        entity_id_mapping: Dict[str, str],
    ) -> List[Relation]:
        """Remap relation endpoints through ``entity_id_mapping`` and drop repeats.

        Ids missing from the mapping are kept unchanged.  Only the first
        relation per ``(source, target, relation_type)`` triple survives; each
        survivor is rebuilt as a new ``Relation`` carrying the remapped ids.
        """
        resolve = entity_id_mapping.get
        emitted = set()
        unique = []

        for relation in relations:
            src = resolve(relation.source_entity_id, relation.source_entity_id)
            dst = resolve(relation.target_entity_id, relation.target_entity_id)

            signature = (src, dst, relation.relation_type)
            if signature not in emitted:
                emitted.add(signature)
                unique.append(Relation(
                    relation_id=relation.relation_id,
                    source_entity_id=src,
                    target_entity_id=dst,
                    relation_type=relation.relation_type,
                    text_evidence=relation.text_evidence,
                    confidence=relation.confidence,
                    source=relation.source,
                ))

        return unique
|
graphrag_v4/graph_builder.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Knowledge Graph Builder (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- load_corpus: inverted chunk_entity_map built once (O(E) not O(E×C))
|
| 6 |
+
- load_graph: restores fully-built graph from save() output — no re-computation
|
| 7 |
+
- build_cross_document_edges: returns 0 on empty (bug fix)
|
| 8 |
+
- doc_ids properly deserialized (bug fix)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
import logging
import math
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Set

import networkx as nx

from .models import Chunk, Entity, Relation, Community, NodeType, EdgeType, generate_id
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Base weight per edge type, keyed by the EdgeType string value.
# edge_weight() starts from these numbers and falls back to 1.0 for any
# edge type that is not listed here.
EDGE_WEIGHTS = {
    EdgeType.CONTAINS.value: 2.0,
    EdgeType.NEXT.value: 1.0,
    EdgeType.MENTIONS.value: 1.0,
    EdgeType.RELATES_TO.value: 1.2,
    EdgeType.CO_OCCURS.value: 0.8,
    EdgeType.CROSS_DOC.value: 1.5,
}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def edge_weight(edge_type: str, properties: dict = None) -> float:
    """Return the weight to store on an edge of the given type.

    Args:
        edge_type: Edge type string; looked up in ``EDGE_WEIGHTS`` with a
            default of 1.0 for unknown types.
        properties: Optional mapping. A numeric ``"confidence"`` multiplies
            the base weight; an integer ``"count"`` greater than 1 adds
            ``log1p(count) * 0.3``.

    Returns:
        The adjusted weight.
    """
    weight = EDGE_WEIGHTS.get(edge_type, 1.0)
    if not properties:
        return weight

    confidence = properties.get("confidence", 1.0)
    if isinstance(confidence, (int, float)):
        weight *= confidence

    occurrences = properties.get("count", 1)
    if isinstance(occurrences, int) and occurrences > 1:
        # Logarithmic bonus so very frequent pairs do not dominate.
        weight += math.log1p(occurrences) * 0.3

    return weight
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class KnowledgeGraphBuilder:
    """Build, analyse, persist and restore the GraphRAG knowledge graph.

    The graph is a ``networkx.DiGraph`` holding document, chunk and entity
    nodes. Every edge carries an ``edge_type`` attribute and a ``weight``
    computed by ``edge_weight``. Next to the graph the builder keeps lookup
    indexes (entities by type, chunks by document, entity -> chunks,
    entity -> documents) and the detected communities.
    """

    # save() stores each edge as one JSON object whose "source"/"target" keys
    # hold the endpoint node ids. Edge attributes that use the same names
    # (add_relation() sets a ``source`` attribute) are written under this
    # prefix so they can never overwrite the endpoint ids.
    _RESERVED_EDGE_KEYS = ("source", "target")
    _EDGE_ATTR_PREFIX = "attr_"

    _log = logging.getLogger(__name__)

    def __init__(self):
        self.graph = nx.DiGraph()
        self.communities: Dict[str, Community] = {}
        self.node_to_community: Dict[str, str] = {}
        self._entities_by_type: Dict[str, Set[str]] = defaultdict(set)
        self._chunks_by_doc: Dict[str, List[str]] = defaultdict(list)
        self._entity_to_chunks: Dict[str, Set[str]] = defaultdict(set)
        self._entity_to_docs: Dict[str, Set[str]] = defaultdict(set)

    # ── Node/edge construction ───────────────────────────────────────

    def add_document(self, doc_id: str, doc_type: str, title: str, metadata: dict = None):
        """Add a document node; ``metadata`` entries become node attributes."""
        self.graph.add_node(
            doc_id,
            node_type=NodeType.DOCUMENT.value,
            doc_type=doc_type,
            title=title,
            **(metadata or {}),
        )

    def add_chunk(self, chunk: Chunk, prev_chunk_id: str = None):
        """Add a chunk node and link it to its document and predecessor.

        A CONTAINS edge is added only if the chunk's document is already in
        the graph; a NEXT edge only if ``prev_chunk_id`` is given and present.
        """
        self.graph.add_node(
            chunk.chunk_id,
            node_type=NodeType.CHUNK.value,
            doc_id=chunk.doc_id,
            doc_type=chunk.doc_type,
            title=chunk.title,
            text=chunk.text,
            parent_context=chunk.parent_context,
            page=chunk.page,
        )
        self._chunks_by_doc[chunk.doc_id].append(chunk.chunk_id)

        if chunk.doc_id in self.graph:
            self.graph.add_edge(
                chunk.doc_id, chunk.chunk_id,
                edge_type=EdgeType.CONTAINS.value,
                weight=edge_weight(EdgeType.CONTAINS.value),
            )
        if prev_chunk_id and prev_chunk_id in self.graph:
            self.graph.add_edge(
                prev_chunk_id, chunk.chunk_id,
                edge_type=EdgeType.NEXT.value,
                weight=edge_weight(EdgeType.NEXT.value),
            )

    def add_entity(self, entity: Entity, chunk_ids: List[str] = None):
        """Add an entity node (once) and MENTIONS edges from known chunks.

        Chunk ids that are not in the graph are ignored. The entity -> chunk
        and entity -> document indexes are updated for every edge added.
        """
        if entity.entity_id not in self.graph:
            self.graph.add_node(
                entity.entity_id,
                node_type=NodeType.ENTITY.value,
                entity_type=entity.entity_type,
                value=entity.value,
                normalized=entity.normalized,
                confidence=entity.confidence,
            )
            self._entities_by_type[entity.entity_type].add(entity.entity_id)

        if not chunk_ids:
            return

        mention_weight = edge_weight(EdgeType.MENTIONS.value)
        for chunk_id in chunk_ids:
            if chunk_id not in self.graph:
                continue
            self.graph.add_edge(
                chunk_id, entity.entity_id,
                edge_type=EdgeType.MENTIONS.value,
                weight=mention_weight,
            )
            self._entity_to_chunks[entity.entity_id].add(chunk_id)
            doc_id = self.graph.nodes[chunk_id].get("doc_id")
            if doc_id:
                self._entity_to_docs[entity.entity_id].add(doc_id)

    def add_relation(self, relation: Relation):
        """Add a RELATES_TO edge; skipped unless both entities are in the graph."""
        if relation.source_entity_id not in self.graph or relation.target_entity_id not in self.graph:
            return
        self.graph.add_edge(
            relation.source_entity_id,
            relation.target_entity_id,
            edge_type=EdgeType.RELATES_TO.value,
            relation_type=relation.relation_type,
            evidence=relation.text_evidence,
            confidence=relation.confidence,
            source=relation.source,
            weight=edge_weight(EdgeType.RELATES_TO.value, {"confidence": relation.confidence}),
        )

    # ── Graph algorithms ─────────────────────────────────────────────

    def build_cross_document_edges(self) -> int:
        """Link chunks of different documents that mention the same entity.

        For every entity seen in more than one document, up to two chunks per
        document are connected pairwise (in both directions) with CROSS_DOC
        edges. Chunk and document ids are processed in sorted order so the
        selected chunks do not depend on set iteration order.

        Returns:
            The number of chunk pairs linked (each pair adds two directed
            edges); 0 when no entity spans several documents.
        """
        edges_added = 0
        cross_weight = edge_weight(EdgeType.CROSS_DOC.value)

        for entity_id, doc_ids in self._entity_to_docs.items():
            if len(doc_ids) < 2:
                continue
            chunk_ids = self._entity_to_chunks.get(entity_id, ())
            if len(chunk_ids) < 2:
                continue

            chunks_by_doc = defaultdict(list)
            for chunk_id in sorted(chunk_ids):
                doc_id = self.graph.nodes.get(chunk_id, {}).get("doc_id")
                if doc_id:
                    chunks_by_doc[doc_id].append(chunk_id)

            # Same attributes for every edge created for this entity.
            attrs = {
                "edge_type": EdgeType.CROSS_DOC.value,
                "shared_entity": entity_id,
                "shared_entity_value": self.graph.nodes.get(entity_id, {}).get("value", ""),
                "weight": cross_weight,
            }

            doc_list = sorted(chunks_by_doc)
            for i, doc_a in enumerate(doc_list):
                for doc_b in doc_list[i + 1:]:
                    for chunk_a in chunks_by_doc[doc_a][:2]:
                        for chunk_b in chunks_by_doc[doc_b][:2]:
                            if self.graph.has_edge(chunk_a, chunk_b):
                                continue
                            self.graph.add_edge(chunk_a, chunk_b, **attrs)
                            self.graph.add_edge(chunk_b, chunk_a, **attrs)
                            edges_added += 1
        return edges_added

    def build_cooccurrence_edges(self, max_entities_per_chunk: int = 20):
        """Add CO_OCCURS edges between entities mentioned in the same chunks.

        Entity pairs are counted per chunk (at most ``max_entities_per_chunk``
        entities per chunk, taken in sorted id order so the result is
        reproducible). Pairs seen in at least two chunks get a CO_OCCURS edge
        in both directions.

        A DiGraph holds one edge per ordered pair, so an existing edge of
        another type (e.g. RELATES_TO) is left untouched instead of being
        relabelled as a co-occurrence.
        """
        chunk_to_entities = defaultdict(set)
        for u, v, data in self.graph.edges(data=True):
            if data.get("edge_type") != EdgeType.MENTIONS.value:
                continue
            if self.graph.nodes.get(v, {}).get("node_type") == NodeType.ENTITY.value:
                chunk_to_entities[u].add(v)

        pair_counts = defaultdict(int)
        for entities in chunk_to_entities.values():
            capped = sorted(entities)[:max_entities_per_chunk]
            for i, a in enumerate(capped):
                for b in capped[i + 1:]:
                    # capped is sorted and unique, hence a < b.
                    pair_counts[(a, b)] += 1

        for (a, b), count in pair_counts.items():
            if count < 2:
                continue
            weight = edge_weight(EdgeType.CO_OCCURS.value, {"count": count})
            for u, v in ((a, b), (b, a)):
                existing = self.graph.get_edge_data(u, v)
                if existing is not None and existing.get("edge_type") != EdgeType.CO_OCCURS.value:
                    continue
                self.graph.add_edge(u, v, edge_type=EdgeType.CO_OCCURS.value, count=count, weight=weight)

    def compute_pagerank(self):
        """Store weighted PageRank scores in the ``pagerank`` node attribute.

        Best effort: on failure the graph is left without scores and a
        warning is logged.
        """
        if self.graph.number_of_nodes() == 0:
            return
        try:
            scores = nx.pagerank(self.graph, alpha=0.85, weight="weight")
        except Exception:
            self._log.warning("PageRank computation failed; nodes keep no pagerank attribute", exc_info=True)
            return
        nx.set_node_attributes(self.graph, scores, "pagerank")

    def detect_communities(self, resolution: float = 1.0, n_levels: int = 2) -> Dict[str, Community]:
        """Partition the graph into communities and register them.

        Level-0 communities come from Leiden on the undirected graph, or from
        grouping nodes by document when Leiden is unavailable or fails. If
        ``n_levels > 1`` and more than 10 level-0 communities were found, a
        coarser level-1 partition (half the resolution) is added as well.
        Only level-0 membership is recorded in ``node_to_community``.

        Returns:
            ``self.communities`` (empty dict for an empty graph).
        """
        undirected = self.graph.to_undirected()
        if undirected.number_of_nodes() == 0:
            return {}

        communities_list = self._leiden_communities(undirected, resolution)
        if communities_list is None:
            communities_list = self._group_by_document()

        for i, node_set in enumerate(communities_list):
            community_id = f"community_0_{i}"
            for nid in self._register_community(community_id, 0, node_set):
                self.node_to_community[nid] = community_id

        if n_levels > 1 and len(communities_list) > 10:
            coarse_communities = self._leiden_communities(undirected, resolution * 0.5)
            if coarse_communities:
                for i, node_set in enumerate(coarse_communities):
                    self._register_community(f"community_1_{i}", 1, node_set)
        return self.communities

    def _register_community(self, community_id: str, level: int, node_set) -> List[str]:
        """Create and store a Community for ``node_set``; return its node list."""
        node_list = list(node_set)
        self.communities[community_id] = Community(
            community_id=community_id, level=level, node_ids=node_list,
            key_entities=self._extract_key_entities(node_list),
            key_topics=self._extract_key_topics(node_list),
        )
        return node_list

    def _leiden_communities(self, G: nx.Graph, resolution: float) -> Optional[List[Set[str]]]:
        """Run Leiden community detection on ``G``.

        Returns:
            A list of node-id sets, or ``None`` when the graph is empty, the
            optional ``igraph``/``leidenalg`` packages cannot be imported, or
            the computation fails (the reason is logged).
        """
        try:
            import igraph as ig
            import leidenalg as la

            nodes = list(G.nodes())
            if not nodes:
                return None
            idx = {n: i for i, n in enumerate(nodes)}

            # One pass keeps the edge list and the weight list aligned.
            edges = []
            weights = []
            for u, v, w in G.edges(data="weight", default=1.0):
                edges.append((idx[u], idx[v]))
                weights.append(w)

            g = ig.Graph(n=len(nodes), edges=edges, directed=False)
            g.es["weight"] = weights
            partition = la.find_partition(
                g, la.RBConfigurationVertexPartition,
                weights=g.es["weight"], resolution_parameter=resolution,
            )
            communities = defaultdict(set)
            for node_i, comm_id in enumerate(partition.membership):
                communities[comm_id].add(nodes[node_i])
            return list(communities.values())
        except ImportError:
            self._log.warning("igraph/leidenalg not importable; Leiden community detection skipped")
            return None
        except Exception:
            self._log.warning("Leiden community detection failed", exc_info=True)
            return None

    def _group_by_document(self) -> List[Set[str]]:
        """Fallback partition: group nodes by ``doc_id``/``doc`` attribute.

        Nodes carrying neither attribute land in one shared "unknown" group.
        """
        doc_groups = defaultdict(set)
        for nid, data in self.graph.nodes(data=True):
            doc_id = data.get("doc_id") or data.get("doc") or "unknown"
            doc_groups[doc_id].add(nid)
        return list(doc_groups.values())

    def _extract_key_entities(self, node_ids: List[str], top_k: int = 8) -> List[str]:
        """Return the ``top_k`` entity values most referenced from ``node_ids``.

        Each out-neighbour that is an entity with a value longer than three
        characters scores ``1 + 10 * pagerank`` (pagerank defaults to 0.01).
        """
        entity_scores = defaultdict(float)
        for nid in node_ids:
            if nid not in self.graph:
                continue
            for neighbor in self.graph.neighbors(nid):
                neighbor_data = self.graph.nodes.get(neighbor, {})
                if neighbor_data.get("node_type") != NodeType.ENTITY.value:
                    continue
                value = neighbor_data.get("value", "")
                if value and len(value) > 3:
                    entity_scores[value] += 1.0 + 10.0 * neighbor_data.get("pagerank", 0.01)
        ranked = sorted(entity_scores.items(), key=lambda item: -item[1])
        return [value for value, _ in ranked[:top_k]]

    def _extract_key_topics(self, node_ids: List[str], top_k: int = 5) -> List[str]:
        """Return up to ``top_k`` distinct chunk/document titles, in node order.

        Titles of five characters or fewer are ignored; titles are truncated
        to 100 characters and duplicates are dropped so repeated titles do
        not fill every slot.
        """
        topic_types = {NodeType.CHUNK.value, NodeType.DOCUMENT.value}
        topics = []
        seen = set()
        for nid in node_ids:
            data = self.graph.nodes.get(nid, {})
            if data.get("node_type") not in topic_types:
                continue
            title = data.get("title", "")
            if not title or len(title) <= 5:
                continue
            topic = title[:100]
            if topic not in seen:
                seen.add(topic)
                topics.append(topic)
        return topics[:top_k]

    def generate_community_summaries(self):
        """Fill ``summary`` on every registered community."""
        for comm in self.communities.values():
            comm.summary = self._build_summary(comm)

    def _build_summary(self, comm: Community) -> str:
        """Compose a text summary from key entities, topics and top chunks.

        The three chunk texts with the highest ``pagerank`` (first 200
        characters each) are joined and cut to 300 characters.
        """
        chunk_texts = []
        for nid in comm.node_ids:
            data = self.graph.nodes.get(nid, {})
            if data.get("node_type") != NodeType.CHUNK.value:
                continue
            text = data.get("text", "")
            if text:
                chunk_texts.append((text[:200], data.get("pagerank", 0)))
        chunk_texts.sort(key=lambda item: -item[1])
        top_texts = [text for text, _ in chunk_texts[:3]]

        parts = []
        if comm.key_entities:
            parts.append(f"Key entities: {', '.join(comm.key_entities[:5])}")
        if comm.key_topics:
            parts.append(f"Topics: {'; '.join(comm.key_topics[:3])}")
        if top_texts:
            parts.append(f"Content: {' '.join(top_texts)[:300]}")
        return ". ".join(parts) if parts else f"Community with {comm.size} nodes"

    # ── Load from corpus (raw JSONL — full rebuild) ──────────────────

    @staticmethod
    def _iter_jsonl(path: Path):
        """Yield one parsed JSON object per non-blank line of ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    yield json.loads(line)

    def load_corpus(self, corpus_dir: Path):
        """Build nodes and edges from the raw corpus files in ``corpus_dir``.

        Reads, when present: ``documents.jsonl``, ``chunks.jsonl``,
        ``chunk_entities.json``, ``entities.jsonl`` and ``relations.jsonl``.
        Consecutive chunks of the same document are linked with NEXT edges.
        Blank lines in the JSONL files are skipped.
        """
        corpus_dir = Path(corpus_dir)

        docs_file = corpus_dir / "documents.jsonl"
        if docs_file.exists():
            for doc in self._iter_jsonl(docs_file):
                self.add_document(
                    doc_id=doc.get("doc_id"),
                    doc_type=doc.get("doc_type", "unknown"),
                    title=doc.get("title", ""),
                    metadata={"filename": doc.get("filename", "")},
                )

        chunks_file = corpus_dir / "chunks.jsonl"
        prev_chunk_id = None
        prev_doc_id = None
        if chunks_file.exists():
            for data in self._iter_jsonl(chunks_file):
                chunk = Chunk(
                    chunk_id=data.get("chunk_id"), doc_id=data.get("doc_id"),
                    doc_type=data.get("doc_type", "unknown"), text=data.get("text", ""),
                    title=data.get("title", ""), parent_context=data.get("parent_context", ""),
                    page=data.get("page", 1),
                )
                # Only chain chunks that belong to the same document.
                link_prev = prev_chunk_id if prev_doc_id == chunk.doc_id else None
                self.add_chunk(chunk, link_prev)
                prev_chunk_id = chunk.chunk_id
                prev_doc_id = chunk.doc_id

        chunk_entities_file = corpus_dir / "chunk_entities.json"
        chunk_entity_map = {}
        if chunk_entities_file.exists():
            with open(chunk_entities_file, "r", encoding="utf-8") as f:
                chunk_entity_map = json.load(f)

        # Invert chunk -> entities once instead of scanning it per entity.
        entity_to_chunk_ids = defaultdict(list)
        for cid, eids in chunk_entity_map.items():
            for eid in eids:
                entity_to_chunk_ids[eid].append(cid)

        entities_file = corpus_dir / "entities.jsonl"
        if entities_file.exists():
            for data in self._iter_jsonl(entities_file):
                entity = Entity(
                    entity_id=data.get("entity_id"), entity_type=data.get("entity_type"),
                    value=data.get("value", ""), normalized=data.get("normalized", ""),
                    confidence=data.get("confidence", 0.8), doc_ids=data.get("doc_ids", []),
                )
                self.add_entity(entity, entity_to_chunk_ids.get(entity.entity_id, []))

        relations_file = corpus_dir / "relations.jsonl"
        if relations_file.exists():
            for data in self._iter_jsonl(relations_file):
                self.add_relation(Relation(
                    relation_id=data.get("relation_id"),
                    source_entity_id=data.get("source_entity_id"),
                    target_entity_id=data.get("target_entity_id"),
                    relation_type=data.get("relation_type"),
                    text_evidence=data.get("text_evidence", ""),
                    confidence=data.get("confidence", 1.0),
                    source=data.get("source", "pattern"),
                ))

    # ── Load pre-built graph (fast — no re-computation) ──────────────

    def load_graph(self, graph_dir: Path) -> bool:
        """Restore a fully-built graph from save() output.

        This skips ALL expensive steps:
        - No corpus parsing
        - No co-occurrence edge building
        - No cross-document edge building
        - No PageRank computation
        - No Leiden community detection
        - No community summary generation

        Edges whose endpoints are not among the loaded nodes are skipped and
        counted in a warning. Files written before the endpoint-collision
        fix in save() contain such records: their "source" field holds the
        relation provenance instead of a node id.

        Returns True if successfully loaded, False otherwise.
        """
        graph_dir = Path(graph_dir)

        nodes_file = graph_dir / "graph_nodes.jsonl"
        edges_file = graph_dir / "graph_edges.jsonl"
        comms_file = graph_dir / "communities.json"

        if not nodes_file.exists() or not edges_file.exists():
            return False

        # Restore nodes and rebuild the per-type / per-document indexes.
        for data in self._iter_jsonl(nodes_file):
            node_id = data.pop("node_id")
            self.graph.add_node(node_id, **data)

            ntype = data.get("node_type")
            if ntype == NodeType.ENTITY.value:
                self._entities_by_type[data.get("entity_type", "")].add(node_id)
            elif ntype == NodeType.CHUNK.value:
                doc_id = data.get("doc_id")
                if doc_id:
                    self._chunks_by_doc[doc_id].append(node_id)

        # Restore edges.
        skipped = 0
        for data in self._iter_jsonl(edges_file):
            source = data.pop("source")
            target = data.pop("target")
            if source not in self.graph or target not in self.graph:
                # add_edge() would silently create phantom nodes here.
                skipped += 1
                continue

            # Undo the escaping save() applies to colliding attribute names.
            for key in self._RESERVED_EDGE_KEYS:
                escaped = self._EDGE_ATTR_PREFIX + key
                if escaped in data:
                    data[key] = data.pop(escaped)

            self.graph.add_edge(source, target, **data)

            # Rebuild entity→chunk and entity→doc indexes from MENTIONS edges.
            if data.get("edge_type") == EdgeType.MENTIONS.value:
                self._entity_to_chunks[target].add(source)
                doc_id = self.graph.nodes.get(source, {}).get("doc_id")
                if doc_id:
                    self._entity_to_docs[target].add(doc_id)

        if skipped:
            self._log.warning("Skipped %d edge record(s) with unknown endpoints in %s", skipped, edges_file)

        # Restore communities.
        if comms_file.exists():
            with open(comms_file, "r", encoding="utf-8") as f:
                comms_data = json.load(f)
            for cd in comms_data:
                comm = Community(
                    community_id=cd["community_id"],
                    level=cd.get("level", 0),
                    node_ids=cd.get("node_ids", []),
                    summary=cd.get("summary", ""),
                    key_entities=cd.get("key_entities", []),
                    key_topics=cd.get("key_topics", []),
                )
                self.communities[comm.community_id] = comm
                for nid in comm.node_ids:
                    self.node_to_community[nid] = comm.community_id

        return True

    # ── Stats & persistence ──────────────────────────────────────────

    def get_stats(self) -> Dict:
        """Return node/edge totals, per-type counts and community counts."""
        node_types = defaultdict(int)
        edge_types = defaultdict(int)
        for _, data in self.graph.nodes(data=True):
            node_types[data.get("node_type", "unknown")] += 1
        for _, _, data in self.graph.edges(data=True):
            edge_types[data.get("edge_type", "unknown")] += 1

        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges(),
            "node_types": dict(node_types),
            "edge_types": dict(edge_types),
            "communities": len(self.communities),
            "cross_doc_entities": sum(1 for docs in self._entity_to_docs.values() if len(docs) > 1),
        }

    def save(self, output_dir: Path):
        """Write nodes, edges and communities to ``output_dir``.

        Produces ``graph_nodes.jsonl``, ``graph_edges.jsonl`` and
        ``communities.json``. In the edge file "source"/"target" always hold
        the endpoint node ids; an edge attribute with one of those names is
        written as "attr_source"/"attr_target" and restored by load_graph().
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        with open(output_dir / "graph_nodes.jsonl", "w", encoding="utf-8") as f:
            for nid, data in self.graph.nodes(data=True):
                # node_id last so a same-named attribute cannot replace it.
                f.write(json.dumps({**data, "node_id": nid}, ensure_ascii=False) + "\n")

        with open(output_dir / "graph_edges.jsonl", "w", encoding="utf-8") as f:
            for u, v, data in self.graph.edges(data=True):
                record = {"source": u, "target": v}
                for key, value in data.items():
                    if key in self._RESERVED_EDGE_KEYS:
                        key = self._EDGE_ATTR_PREFIX + key
                    record[key] = value
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

        with open(output_dir / "communities.json", "w", encoding="utf-8") as f:
            json.dump([{
                "community_id": c.community_id,
                "level": c.level,
                "node_ids": c.node_ids,
                "summary": c.summary,
                "key_entities": c.key_entities,
                "key_topics": c.key_topics,
                "size": c.size,
            } for c in self.communities.values()], f, ensure_ascii=False, indent=2)
|
graphrag_v4/models.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Core Data Models (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- Pre-compiled regex patterns (avoid re-compiling per call)
|
| 6 |
+
- normalize_text uses str.translate for speed
|
| 7 |
+
- generate_id uses hashlib once, not per-call import
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from typing import Dict, List, Optional, Any
|
| 12 |
+
from enum import Enum
|
| 13 |
+
import hashlib
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
# ── Enums ────────────────────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
class NodeType(str, Enum):
    """Kinds of nodes stored in the knowledge graph (string-valued)."""

    DOCUMENT = "document"
    CHUNK = "chunk"
    ENTITY = "entity"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class EdgeType(str, Enum):
    """Kinds of edges stored in the knowledge graph (string-valued)."""

    CONTAINS = "contains"
    NEXT = "next"
    MENTIONS = "mentions"
    RELATES_TO = "relates_to"
    CO_OCCURS = "co_occurs"
    CROSS_DOC = "cross_doc"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class DocType(str, Enum):
    """Document categories assigned by detect_doc_type (string-valued)."""

    LAW = "law"
    ADMISSION_RULES = "admission_rules"
    EDUCATIONAL_PROGRAM = "educational_program"
    REGULATION = "regulation"
    UNKNOWN = "unknown"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ── Dataclasses ──────────────────────────────────────────────────────
|
| 42 |
+
|
| 43 |
+
@dataclass
class Chunk:
    """A piece of document text together with its location metadata.

    ``text_hash`` is derived from ``text`` on construction unless a
    non-empty value is supplied.
    """

    chunk_id: str
    doc_id: str
    doc_type: str
    text: str
    title: str = ""
    parent_context: str = ""
    page: int = 1
    char_start: int = 0
    char_end: int = 0
    text_hash: str = ""

    def __post_init__(self):
        # Keep a caller-provided hash; otherwise use the first 12 hex digits
        # of the MD5 of the text.
        if self.text_hash:
            return
        digest = hashlib.md5(self.text.encode()).hexdigest()
        self.text_hash = digest[:12]

    @property
    def full_text(self) -> str:
        """Text prefixed by the bracketed title and context line, if set."""
        header = []
        if self.title:
            header.append(f"[{self.title}]")
        if self.parent_context:
            header.append(f"Context: {self.parent_context}")
        return "\n".join([*header, self.text])
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@dataclass
class Entity:
    """An extracted entity: raw value, normalized form and provenance."""

    entity_id: str
    entity_type: str
    value: str
    normalized: str
    source_id: str = ""
    confidence: float = 0.8
    # Each instance gets its own list.
    doc_ids: List[str] = field(default_factory=list)
    embedding: Optional[List[float]] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@dataclass
class Relation:
    """A typed, directed link between two entities with its text evidence."""

    relation_id: str
    source_entity_id: str
    target_entity_id: str
    relation_type: str
    text_evidence: str
    confidence: float = 1.0
    source: str = "pattern"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@dataclass
class Community:
    """A group of graph nodes with its summary and descriptive keywords."""

    community_id: str
    level: int
    node_ids: List[str]
    summary: str = ""
    key_entities: List[str] = field(default_factory=list)
    key_topics: List[str] = field(default_factory=list)
    summary_embedding: Optional[List[float]] = None

    @property
    def size(self) -> int:
        """Number of member nodes."""
        return len(self.node_ids)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@dataclass
class SearchResult:
    """One retrieved node with its combined and per-signal scores."""

    node_id: str
    node_type: str
    score: float
    text: str
    title: str = ""
    # Per-signal components; all default to 0.0.
    dense_score: float = 0.0
    sparse_score: float = 0.0
    graph_score: float = 0.0
    doc_id: str = ""
    doc_type: str = ""
    page: int = 1
    metadata: Dict[str, Any] = field(default_factory=dict)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@dataclass
class QAResult:
    """Answer to a question plus the sources and reasoning notes behind it."""

    question: str
    answer: str
    confidence: float
    sources: List[SearchResult]
    reasoning: List[str] = field(default_factory=list)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ── Utility functions (optimized) ────────────────────────────────────
|
| 135 |
+
|
| 136 |
+
# Any run of whitespace characters.
_WHITESPACE_RE = re.compile(r'\s+')
# Maps typographic apostrophes and the backtick to the ASCII apostrophe.
_QUOTE_TABLE = str.maketrans("\u2018\u2019`\u02BC", "''''")


def normalize_text(text: str) -> str:
    """Lower-case, trim, collapse whitespace and unify apostrophes.

    Returns:
        ``text`` lower-cased and stripped, with every whitespace run replaced
        by one space and U+2018, U+2019, U+02BC and the backtick replaced by
        ``'``.
    """
    collapsed = _WHITESPACE_RE.sub(' ', text.lower().strip())
    return collapsed.translate(_QUOTE_TABLE)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def generate_id(*parts) -> str:
    """Return a 16-hex-character id derived from ``parts``.

    The parts are converted with ``str``, joined with ``":"`` and hashed with
    SHA-256; the first 16 hex digits of the digest are returned.
    """
    joined = ":".join(map(str, parts))
    digest = hashlib.sha256(joined.encode())
    return digest.hexdigest()[:16]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Pre-compiled doc type patterns
|
| 154 |
+
# Raw patterns per document type; dict order is the order in which
# detect_doc_type tries them, so earlier types win on multiple matches.
_DOC_TYPE_PATTERNS = {
    DocType.LAW: [r"закон\s+україни", r"кодекс"],
    DocType.ADMISSION_RULES: [r"правила?\s+прийому", r"умови\s+прийому", r"вступ"],
    DocType.EDUCATIONAL_PROGRAM: [r"освітн[ьяі]+\s*програм", r"навчальн\w+\s*план"],
    DocType.REGULATION: [r"положення\s+про", r"регламент", r"порядок"],
}
_DOC_TYPE_COMPILED = {
    doc_type: [re.compile(raw, re.IGNORECASE) for raw in raws]
    for doc_type, raws in _DOC_TYPE_PATTERNS.items()
}


def detect_doc_type(filename: str, text_sample: str) -> DocType:
    """Classify a document from its filename and the start of its text.

    The filename and the first 2000 characters of ``text_sample`` are
    lower-cased and searched with each type's patterns in table order.

    Returns:
        The first matching ``DocType``, or ``DocType.UNKNOWN``.
    """
    haystack = f"{filename} {text_sample[:2000]}".lower()
    for doc_type, patterns in _DOC_TYPE_COMPILED.items():
        if any(pattern.search(haystack) for pattern in patterns):
            return doc_type
    return DocType.UNKNOWN
|
graphrag_v4/qa.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Question Answering System"""
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import time
|
| 7 |
+
from typing import Dict, List, Optional
|
| 8 |
+
|
| 9 |
+
from .models import QAResult, SearchResult
|
| 10 |
+
from .retriever import HybridRetriever
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class LLMClient:
    """Optional wrapper around the OpenAI chat-completions client.

    The client is created only when an API key is available (argument or
    ``OPENAI_API_KEY``) and the ``openai`` package can be imported;
    otherwise ``available`` is False and ``complete`` returns None.
    """

    SYSTEM_PROMPT = """You are an expert on Ukrainian legal and educational documents.
Answer precisely based on the provided context.
Cite sources (e.g., "according to p. 3.2").
If context is insufficient, clearly state this."""

    def __init__(self, api_key: str = None, model: str = "gpt-4o-mini", base_url: str = None):
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.model = model
        self.base_url = base_url
        self.client = None

        if not self.api_key:
            return
        try:
            import openai
            self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
        except ImportError:
            # openai is optional; without it the client stays unavailable.
            pass

    @property
    def available(self) -> bool:
        """True when an OpenAI client was created."""
        return self.client is not None

    def complete(self, prompt: str, system_prompt: str = None, temperature: float = 0.3, max_tokens: int = 1000) -> Optional[str]:
        """Send ``prompt`` (after an optional system message) to the model.

        Returns:
            The content of the first choice, or None when no client exists or
            the request raises any exception.
        """
        if not self.client:
            return None

        messages = [{"role": "user", "content": prompt}]
        if system_prompt:
            messages.insert(0, {"role": "system", "content": system_prompt})

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        except Exception:
            return None
        else:
            try:
                return response.choices[0].message.content
            except Exception:
                return None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# User-message template for GraphRAGQA.answer; filled with str.format using
# the "question" and "context" placeholders.
QA_PROMPT = """Answer the question based on the provided context.

Question: {question}

Context:
{context}

Instructions:
1. Answer directly and specifically
2. Cite relevant sections (p. X.X, art. Y)
3. If context is insufficient, state this

Answer:"""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class GraphRAGQA:
    """Answer questions from retrieved chunks, optionally phrased by an LLM.

    Retrieval is delegated to the retriever's ``search`` and
    ``search_communities``. When no usable LLM client is configured, or the
    LLM call returns nothing, an extractive fallback answer is assembled from
    the top results instead.
    """

    # Citation cues that each add a small confidence bonus. Every entry is one
    # category (page reference, article reference, "according to",
    # "in accordance with") matched in Ukrainian or English, so answers that
    # follow the English citation format requested by the prompt
    # ("p. X.X, art. Y") are credited the same way Ukrainian ones are.
    _CITATION_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r"п\.\s*\d|\bp\.\s*\d",
            r"ст\.\s*\d|\bart\.\s*\d",
            r"згідно|\baccording to\b",
            r"відповідно|\bin accordance with\b",
        )
    )

    def __init__(self, retriever: "HybridRetriever", llm_client: "LLMClient" = None):
        """Keep the retriever and the optional LLM client."""
        self.retriever = retriever
        self.llm = llm_client

    def answer(self, question: str, top_k: int = 8, use_communities: bool = True) -> "QAResult":
        """Retrieve context for ``question`` and produce an answer.

        Args:
            question: Natural-language question.
            top_k: Number of results requested from the retriever.
            use_communities: Also look up the two best-matching communities
                and append their summaries as general context.

        Returns:
            A ``QAResult`` with the answer text, a heuristic confidence, the
            retrieved sources and a short reasoning trace.
        """
        start_time = time.time()
        reasoning = []

        results = self.retriever.search(question, top_k=top_k, expand_graph=True)
        reasoning.append(f"Found {len(results)} results")

        communities = []
        if use_communities:
            communities = self.retriever.search_communities(question, top_k=2)
            if communities:
                reasoning.append(f"Communities: {len(communities)}")

        context = self._build_context(results)
        community_context = self._build_community_context(communities)

        if self.llm and self.llm.available:
            full_context = context
            if community_context:
                full_context += f"\n\nGeneral context:\n{community_context}"

            prompt = QA_PROMPT.format(question=question, context=full_context)
            answer = self.llm.complete(prompt, system_prompt=LLMClient.SYSTEM_PROMPT)
            if not answer:
                # The LLM call failed or returned empty text.
                answer = self._fallback_answer(results, communities)
        else:
            answer = self._fallback_answer(results, communities)

        confidence = self._calculate_confidence(results, answer)
        elapsed = time.time() - start_time
        reasoning.append(f"Time: {elapsed:.2f}s, Confidence: {confidence:.0%}")

        return QAResult(
            question=question,
            answer=answer,
            confidence=confidence,
            sources=results,
            reasoning=reasoning,
        )

    def _build_context(self, results: List["SearchResult"], max_chars: int = 4000) -> str:
        """Format results as a numbered context block capped at ``max_chars``.

        Each entry carries the title, the page and at most 600 characters of
        text. Entries are added in order until the next one would exceed the
        budget (separators are not counted).
        """
        if not results:
            return "No relevant context found."

        parts = []
        total_chars = 0

        for i, result in enumerate(results, 1):
            part = f"{i}. [{result.title}] (p. {result.page})\n{result.text[:600]}"
            if total_chars + len(part) > max_chars:
                break
            parts.append(part)
            total_chars += len(part)

        return "\n\n".join(parts)

    def _build_community_context(self, communities: List[Dict]) -> str:
        """Return bullet lines for the non-empty summaries of the first two communities."""
        if not communities:
            return ""
        return "\n".join(f"• {c.get('summary', '')}" for c in communities[:2] if c.get("summary"))

    def _fallback_answer(self, results: List["SearchResult"], communities: List[Dict]) -> str:
        """Build an extractive answer from the top three results without an LLM."""
        if not results:
            return "No relevant information found. Please try rephrasing your question."

        parts = ["Based on found context:\n"]

        for i, result in enumerate(results[:3], 1):
            parts.append(f"{i}. **{result.title}** (p. {result.page})")
            if result.text:
                parts.append(f"   {result.text[:300]}...")
            parts.append(f"   [relevance: {result.score:.3f}]\n")

        if communities:
            parts.append("\nGeneral context:")
            for comm in communities[:2]:
                if comm.get("summary"):
                    parts.append(f"• {comm['summary'][:200]}")

        return "\n".join(parts)

    def _calculate_confidence(self, results: List["SearchResult"], answer: str) -> float:
        """Heuristic confidence in [0.3, 0.95].

        Starts at 0.3 and adds bonuses for the number of results, a top score
        above 0.02, at least two results found by both dense and sparse
        search, each citation-cue category present in the answer (+0.03 each)
        and an answer longer than 200 characters.
        """
        confidence = 0.3

        if len(results) >= 5:
            confidence += 0.15
        elif len(results) >= 2:
            confidence += 0.08

        if results and results[0].score > 0.02:
            confidence += 0.15

        hybrid_count = sum(1 for r in results if r.dense_score > 0 and r.sparse_score > 0)
        if hybrid_count >= 2:
            confidence += 0.15

        for pattern in self._CITATION_PATTERNS:
            if pattern.search(answer):
                confidence += 0.03

        if len(answer) > 200:
            confidence += 0.05

        return min(confidence, 0.95)
|
graphrag_v4/retriever.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Hybrid Retriever (Optimized)
|
| 3 |
+
|
| 4 |
+
Optimizations:
|
| 5 |
+
- SparseIndex: inverted index (token → doc_ids) for O(|query_tokens| * avg_postings) search
|
| 6 |
+
- DenseIndex: batch add_vectors, single FAISS call
|
| 7 |
+
- Community search: pre-normalized embedding matrix, vectorized dot product
|
| 8 |
+
- Graph expansion: only expand chunk-type neighbors
|
| 9 |
+
- Safe edge_data handling (get_edge_data may return None; it is treated as {})
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import pickle
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Dict, List, Tuple
|
| 16 |
+
from collections import defaultdict
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
from .models import SearchResult, Community, EdgeType
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class SparseIndex:
    """Inverted sparse index for O(query_tokens × avg_postings) lookup.

    Documents are stored as ``token_id -> weight`` maps; scoring is the dot
    product between the query's sparse weights and each document's weights,
    accumulated through per-token posting lists.
    """

    def __init__(self):
        self.doc_ids: List[str] = []
        self.doc_data: Dict[str, Dict] = {}
        self.sparse_vectors: Dict[str, Dict[int, float]] = {}
        # Inverted index: token_id → [(doc_id, weight)]
        self._inverted: Dict[int, List[Tuple[str, float]]] = defaultdict(list)
        self._dirty = False

    def add_document(self, doc_id: str, data: Dict = None, sparse_weights: Dict[int, float] = None):
        """Register a document and, when given, its sparse weights.

        Adding a ``doc_id`` that is already known replaces its stored data.
        If new ``sparse_weights`` are supplied, the previous postings are
        removed first so the document is never scored twice.
        """
        if doc_id not in self.doc_data:
            self.doc_ids.append(doc_id)
        self.doc_data[doc_id] = data or {}

        if sparse_weights:
            if doc_id in self.sparse_vectors:
                self._drop_postings(doc_id)
            self.sparse_vectors[doc_id] = sparse_weights
            for token_id, weight in sparse_weights.items():
                self._inverted[token_id].append((doc_id, weight))

    def _drop_postings(self, doc_id: str) -> None:
        """Remove the stored sparse vector of ``doc_id`` and all its postings."""
        for token_id in self.sparse_vectors.pop(doc_id, {}):
            remaining = [p for p in self._inverted.get(token_id, ()) if p[0] != doc_id]
            if remaining:
                self._inverted[token_id] = remaining
            else:
                self._inverted.pop(token_id, None)

    def search(self, query_sparse: Dict[int, float], top_k: int = 100) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` ``(doc_id, score)`` pairs, best first.

        Returns an empty list for an empty index, an empty query, a
        non-positive ``top_k`` or when no query token occurs in the index.
        """
        # A non-positive top_k must yield nothing; without this guard the
        # `[-top_k:]` slice below would return every hit for top_k == 0.
        if top_k <= 0 or not self._inverted or not query_sparse:
            return []

        scores = defaultdict(float)
        for token_id, q_weight in query_sparse.items():
            # Membership test first so the defaultdict gains no empty entries.
            if token_id in self._inverted:
                for doc_id, d_weight in self._inverted[token_id]:
                    scores[doc_id] += q_weight * d_weight

        if not scores:
            return []

        # Partial sort: only need top_k
        items = list(scores.items())
        if len(items) <= top_k:
            items.sort(key=lambda x: -x[1])
            return items
        # Use numpy argpartition for large result sets
        vals = np.array([s for _, s in items])
        top_indices = np.argpartition(vals, -top_k)[-top_k:]
        top_indices = top_indices[np.argsort(-vals[top_indices])]
        return [(items[i][0], items[i][1]) for i in top_indices]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class DenseIndex:
    """FAISS HNSW index over L2-normalised dense embeddings.

    Vectors are buffered by ``add_vector`` and pushed to FAISS in one batch by
    ``flush``. If ``faiss`` cannot be imported the index degrades to a no-op:
    nothing is stored and ``search`` returns an empty list.
    """

    def __init__(self, dimension: int = 1024):
        self.dimension = dimension
        self.index = None
        self.doc_ids: List[str] = []
        self.doc_data: Dict[str, Dict] = {}
        self._faiss_available = False
        self._pending_vecs = []
        self._pending_ids = []
        self._pending_data = []

        try:
            import faiss
            self._faiss_available = True
            self.index = faiss.IndexHNSWFlat(dimension, 32)
            self.index.hnsw.efConstruction = 200
            self.index.hnsw.efSearch = 128
        except ImportError:
            # faiss is optional; every method checks _faiss_available.
            pass

    def add_vector(self, doc_id: str, embedding: np.ndarray, data: Dict = None):
        """Buffer vectors for batch add."""
        if not self._faiss_available:
            return
        self._pending_vecs.append(embedding)
        self._pending_ids.append(doc_id)
        self._pending_data.append(data or {})

    def flush(self):
        """Batch-add all pending vectors to FAISS (much faster than one-at-a-time)."""
        if not self._faiss_available or not self._pending_vecs:
            return

        vecs = np.array(self._pending_vecs, dtype=np.float32)
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero vectors untouched
        vecs = vecs / norms

        self.index.add(vecs)
        # Position in doc_ids mirrors the row id FAISS assigns on add.
        self.doc_ids.extend(self._pending_ids)
        for doc_id, data in zip(self._pending_ids, self._pending_data):
            self.doc_data[doc_id] = data

        self._pending_vecs.clear()
        self._pending_ids.clear()
        self._pending_data.clear()

    def search(self, query_embedding: np.ndarray, top_k: int = 100) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` ``(doc_id, similarity)`` pairs, best first.

        The query is L2-normalised. When the underlying index uses the L2
        metric (the ``IndexHNSWFlat`` default), FAISS reports squared
        distances; for unit vectors these are converted to cosine similarity
        via ``1 - d² / 2`` so that a larger score always means a closer match.
        Result order is unchanged by the conversion.
        """
        self.flush()  # ensure all vectors are indexed
        if top_k <= 0 or not self._faiss_available or self.index.ntotal == 0:
            return []

        import faiss

        vec = query_embedding.astype(np.float32).reshape(1, -1)
        norm = np.linalg.norm(vec)
        if norm > 0:
            vec = vec / norm

        raw_scores, indices = self.index.search(vec, min(top_k, self.index.ntotal))
        is_squared_l2 = getattr(self.index, "metric_type", None) == faiss.METRIC_L2

        results = []
        for raw, idx in zip(raw_scores[0], indices[0]):
            # FAISS pads missing neighbours with -1.
            if 0 <= idx < len(self.doc_ids):
                score = float(raw)
                if is_squared_l2:
                    score = 1.0 - score / 2.0
                results.append((self.doc_ids[idx], score))

        return results

    def save(self, path: Path):
        """Write ``<path>.faiss`` and ``<path>.meta.pkl`` (no-op without faiss)."""
        self.flush()
        if not self._faiss_available:
            return
        import faiss
        faiss.write_index(self.index, str(path.with_suffix(".faiss")))
        with open(path.with_suffix(".meta.pkl"), "wb") as f:
            pickle.dump({"doc_ids": self.doc_ids, "doc_data": self.doc_data, "dimension": self.dimension}, f)

    def load(self, path: Path):
        """Load the index and metadata written by ``save`` (no-op without faiss)."""
        if not self._faiss_available:
            return
        import faiss
        self.index = faiss.read_index(str(path.with_suffix(".faiss")))
        with open(path.with_suffix(".meta.pkl"), "rb") as f:
            # SECURITY: pickle executes arbitrary code on load; only open
            # index files produced by a trusted build.
            meta = pickle.load(f)
        self.doc_ids = meta["doc_ids"]
        self.doc_data = meta["doc_data"]
        self.dimension = meta["dimension"]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class HybridRetriever:
    """Hybrid dense + sparse + graph retriever over indexed chunks.

    Dense (FAISS) and sparse (inverted index) rankings are fused with weighted
    Reciprocal Rank Fusion. The best hits can then be expanded through the
    knowledge graph, and community summaries can be searched separately.
    """

    def __init__(self, graph=None, communities: Dict[str, "Community"] = None):
        self.graph = graph
        self.communities = communities or {}
        self.node_to_community = {}
        self.dense_index = DenseIndex()
        self.sparse_index = SparseIndex()
        self._embedder = None
        # Pre-normalized community embedding matrix for vectorized search
        self._comm_ids: List[str] = []
        self._comm_matrix: np.ndarray = None  # shape (n_communities, dim), pre-normalized

    @property
    def embedder(self):
        """Embedder obtained from ``get_embedder`` on first access and then reused."""
        if self._embedder is None:
            from .embeddings import get_embedder
            self._embedder = get_embedder()
        return self._embedder

    def index_chunks(self, chunks: List[Dict], batch_size: int = 64):
        """Embed chunks in batches and add them to the dense and sparse indexes.

        The embedded text is the optional ``[title]`` line, the optional
        ``Context: parent_context`` line and the chunk text, joined by newlines.
        """
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]

            texts = []
            for chunk in batch:
                parts = []
                if chunk.get("title"):
                    parts.append(f"[{chunk['title']}]")
                if chunk.get("parent_context"):
                    parts.append(f"Context: {chunk['parent_context']}")
                parts.append(chunk.get("text", ""))
                texts.append("\n".join(parts))

            embeddings = self.embedder.embed_batch(texts, return_sparse=True, batch_size=batch_size)

            for chunk, emb in zip(batch, embeddings):
                chunk_id = chunk.get("chunk_id")
                self.dense_index.add_vector(chunk_id, emb.dense, chunk)
                self.sparse_index.add_document(chunk_id, chunk, sparse_weights=emb.sparse)

        # Flush batch to FAISS
        self.dense_index.flush()

    def index_communities(self, communities: Dict[str, "Community"]):
        """Index communities: build pre-normalized embedding matrix for vectorized search."""
        self.communities = communities
        for comm in communities.values():
            for nid in comm.node_ids:
                self.node_to_community[nid] = comm.community_id

        summaries = []
        comm_ids = []
        for comm in communities.values():
            if comm.summary:
                summaries.append(comm.summary)
                comm_ids.append(comm.community_id)

        if summaries:
            try:
                embeddings = self.embedder.embed_batch(summaries, return_sparse=False)
                matrix = np.array([e.dense for e in embeddings], dtype=np.float32)
                norms = np.linalg.norm(matrix, axis=1, keepdims=True)
                norms[norms == 0] = 1.0
                self._comm_matrix = matrix / norms
                self._comm_ids = comm_ids
            except Exception:
                # Best-effort: without embeddings, search_communities uses
                # its keyword fallback.
                self._comm_matrix = None
                self._comm_ids = []

    def search(
        self,
        query: str,
        top_k: int = 10,
        expand_graph: bool = True,
        weights: Tuple[float, float, float] = (0.5, 0.3, 0.2),
    ) -> List["SearchResult"]:
        """Return up to ``top_k`` results for ``query``, best score first.

        Args:
            query: Query text.
            top_k: Maximum number of results returned.
            expand_graph: Add chunk neighbours of the three best hits when a
                graph is available.
            weights: ``(dense, sparse, graph)`` weights. The first two scale
                the RRF contributions; the last scales graph-expansion scores.

        Dense and sparse searches each fetch ``top_k * 3`` candidates. Items
        found by both receive a 1.2x boost. Graph-expanded candidates are
        merged by score before the final cut, so the returned list is always
        sorted by ``score``.
        """
        w_dense, w_sparse, w_graph = weights

        query_emb = self.embedder.embed(query, return_sparse=True)

        dense_results = self.dense_index.search(query_emb.dense, top_k=top_k * 3)
        sparse_results = self.sparse_index.search(query_emb.sparse, top_k=top_k * 3) if query_emb.sparse else []

        k = 60  # RRF constant
        rankings = {}

        for rank, (doc_id, score) in enumerate(dense_results):
            rankings[doc_id] = {
                "dense_score": score, "dense_rank": rank + 1,
                "sparse_score": 0.0, "sparse_rank": None, "graph_score": 0.0,
            }

        for rank, (doc_id, score) in enumerate(sparse_results):
            if doc_id not in rankings:
                rankings[doc_id] = {
                    "dense_score": 0.0, "dense_rank": None,
                    "sparse_score": score, "sparse_rank": rank + 1, "graph_score": 0.0,
                }
            else:
                rankings[doc_id]["sparse_score"] = score
                rankings[doc_id]["sparse_rank"] = rank + 1

        results = []
        for doc_id, scores in rankings.items():
            rrf_score = 0.0
            if scores["dense_rank"]:
                rrf_score += w_dense / (k + scores["dense_rank"])
            if scores["sparse_rank"]:
                rrf_score += w_sparse / (k + scores["sparse_rank"])
            if scores["dense_rank"] and scores["sparse_rank"]:
                rrf_score *= 1.2

            # Chunk metadata normally lives in the sparse index; fall back to
            # the dense index so a dense-only hit still carries its text.
            data = self.sparse_index.doc_data.get(doc_id) or self.dense_index.doc_data.get(doc_id, {})
            results.append(SearchResult(
                node_id=doc_id,
                node_type=data.get("node_type", "chunk"),
                score=rrf_score,
                text=data.get("text", ""),
                title=data.get("title", ""),
                dense_score=scores["dense_score"],
                sparse_score=scores["sparse_score"],
                graph_score=scores["graph_score"],
                doc_id=data.get("doc_id", ""),
                doc_type=data.get("doc_type", ""),
                page=data.get("page", 1),
                metadata=data,
            ))

        results.sort(key=lambda x: -x.score)
        results = results[:top_k]

        if expand_graph and self.graph:
            expanded = self._expand_from_graph(results[:3])
            seen_ids = {r.node_id for r in results}
            added = False
            for node_id, graph_score, data in expanded:
                if node_id not in seen_ids and len(results) < top_k * 2:
                    results.append(SearchResult(
                        node_id=node_id,
                        node_type=data.get("node_type", "chunk"),
                        score=graph_score * w_graph,
                        text=data.get("text", ""),
                        title=data.get("title", ""),
                        graph_score=graph_score,
                        doc_id=data.get("doc_id", ""),
                        doc_type=data.get("doc_type", ""),
                        page=data.get("page", 1),
                        metadata=data,
                    ))
                    seen_ids.add(node_id)
                    added = True
            if added:
                # Merge expanded candidates by score; without this they sit
                # behind every base hit and the final cut drops them blindly.
                # The sort is stable, so ties keep the base ordering.
                results.sort(key=lambda x: -x.score)

        return results[:top_k]

    def _expand_from_graph(self, seed_results: List["SearchResult"], max_expanded: int = 10) -> List[Tuple[str, float, Dict]]:
        """Collect chunk-type graph neighbours of the seed results.

        Returns up to ``max_expanded`` ``(node_id, graph_score, node_data)``
        tuples sorted by score. Each score is half the seed's score times the
        edge weight, with a 1.5x boost for cross-document edges.
        """
        if self.graph is None:
            return []

        expanded = []
        visited = set()

        for result in seed_results:
            node_id = result.node_id
            if node_id not in self.graph:
                continue
            visited.add(node_id)

            for neighbor in self.graph.neighbors(node_id):
                if neighbor in visited:
                    continue
                visited.add(neighbor)

                neighbor_data = dict(self.graph.nodes.get(neighbor, {}))
                # Only expand to chunk nodes (entities/docs don't have searchable text)
                if neighbor_data.get("node_type") != "chunk":
                    continue

                # get_edge_data may return None; treat that as "no attributes".
                edge_data = self.graph.get_edge_data(node_id, neighbor) or {}
                edge_type = edge_data.get("edge_type", "")
                edge_w = edge_data.get("weight", 1.0)

                cross_doc_boost = 1.5 if edge_type == EdgeType.CROSS_DOC.value else 1.0
                graph_score = result.score * 0.5 * edge_w * cross_doc_boost

                expanded.append((neighbor, graph_score, neighbor_data))

        expanded.sort(key=lambda x: -x[1])
        return expanded[:max_expanded]

    def search_communities(self, query: str, top_k: int = 3) -> List[Dict]:
        """Vectorized community search using pre-normalized embedding matrix.

        With community embeddings available, communities are ranked by cosine
        similarity and those at or below 0.1 are dropped. Otherwise a keyword
        fallback counts query words longer than three characters that occur
        in the summary or key entities.
        """
        if not self.communities:
            return []

        if self._comm_matrix is not None and len(self._comm_ids) > 0:
            query_emb = self.embedder.embed(query, return_sparse=False)
            q_vec = query_emb.dense.astype(np.float32)
            q_norm = np.linalg.norm(q_vec)
            if q_norm > 0:
                q_vec = q_vec / q_norm

            # Single matrix-vector multiply: (n_communities, dim) @ (dim,) → (n_communities,)
            sims = self._comm_matrix @ q_vec

            # Get top-k indices
            if len(sims) <= top_k:
                top_indices = np.argsort(-sims)
            else:
                top_indices = np.argpartition(sims, -top_k)[-top_k:]
                top_indices = top_indices[np.argsort(-sims[top_indices])]

            results = []
            for idx in top_indices:
                sim = float(sims[idx])
                if sim <= 0.1:
                    continue
                comm_id = self._comm_ids[idx]
                if comm_id in self.communities:
                    comm = self.communities[comm_id]
                    results.append({
                        "community_id": comm_id,
                        "summary": comm.summary,
                        "key_entities": comm.key_entities,
                        "key_topics": comm.key_topics,
                        "size": comm.size,
                        "similarity": sim,
                    })
            return results

        # Fallback: keyword matching
        query_words = [word for word in query.lower().split() if len(word) > 3]
        scored = []
        for comm in self.communities.values():
            # Summary and key entities may be missing; treat them as empty.
            text = ((comm.summary or "") + " " + " ".join(comm.key_entities or [])).lower()
            matches = sum(1 for word in query_words if word in text)
            if matches > 0:
                scored.append((comm, matches))

        scored.sort(key=lambda x: -x[1])
        return [{
            "community_id": comm.community_id,
            "summary": comm.summary,
            "key_entities": comm.key_entities,
            "key_topics": comm.key_topics,
            "size": comm.size,
        } for comm, _ in scored[:top_k]]

    def save(self, output_dir: Path):
        """Persist the dense index, sparse index and community embeddings to ``output_dir``."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        self.dense_index.save(output_dir / "dense_index")
        with open(output_dir / "sparse_index.pkl", "wb") as f:
            pickle.dump({
                "doc_ids": self.sparse_index.doc_ids,
                "doc_data": self.sparse_index.doc_data,
                "sparse_vectors": self.sparse_index.sparse_vectors,
            }, f)
        if self._comm_matrix is not None:
            with open(output_dir / "community_embeddings.pkl", "wb") as f:
                pickle.dump({"ids": self._comm_ids, "matrix": self._comm_matrix}, f)

    def load(self, index_dir: Path):
        """Load whatever index files exist in ``index_dir``.

        Loading is idempotent: the sparse inverted index is rebuilt from
        scratch, so repeated loads do not duplicate postings.

        SECURITY: the files are read with ``pickle``, which executes arbitrary
        code on load; only load index directories from a trusted build.
        """
        index_dir = Path(index_dir)
        dense_path = index_dir / "dense_index"
        if dense_path.with_suffix(".faiss").exists():
            self.dense_index.load(dense_path)

        sparse_path = index_dir / "sparse_index.pkl"
        if sparse_path.exists():
            with open(sparse_path, "rb") as f:
                data = pickle.load(f)
            self.sparse_index.doc_ids = data["doc_ids"]
            self.sparse_index.doc_data = data["doc_data"]
            self.sparse_index.sparse_vectors = data.get("sparse_vectors", {})
            # Rebuild inverted index from a clean slate; appending to the
            # existing one would double-count every posting on reload.
            inverted = defaultdict(list)
            for doc_id, weights in self.sparse_index.sparse_vectors.items():
                for token_id, weight in weights.items():
                    inverted[token_id].append((doc_id, weight))
            self.sparse_index._inverted = inverted

        comm_emb_path = index_dir / "community_embeddings.pkl"
        if comm_emb_path.exists():
            with open(comm_emb_path, "rb") as f:
                comm_data = pickle.load(f)
            if isinstance(comm_data, dict) and "matrix" in comm_data:
                self._comm_ids = comm_data["ids"]
                self._comm_matrix = comm_data["matrix"]
            else:
                # Backwards compatibility with old format
                self._comm_ids = list(comm_data.keys())
                vecs = list(comm_data.values())
                if vecs:
                    matrix = np.array(vecs, dtype=np.float32)
                    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
                    norms[norms == 0] = 1.0
                    self._comm_matrix = matrix / norms
                else:
                    # Keep ids and matrix consistent when the file is empty.
                    self._comm_matrix = None
|
graphrag_v4/visualization.py
ADDED
|
@@ -0,0 +1,719 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 - Knowledge Graph Visualization
|
| 3 |
+
|
| 4 |
+
Interactive visualization for large graphs (10k+ nodes) using:
|
| 5 |
+
- Canvas rendering via vis-network
|
| 6 |
+
- Force-directed layout with Barnes-Hut optimization
|
| 7 |
+
- Community-based coloring
|
| 8 |
+
- Hierarchical filtering
|
| 9 |
+
- Search and zoom
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Optional
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import colorsys
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def generate_colors(n: int) -> List[str]:
    """Return ``n`` hex color strings with hues spread evenly around the wheel.

    Saturation cycles through three levels and brightness through two so that
    neighbouring hues differ a little more. ``n == 0`` yields a single default
    blue (``"#4a90d9"``).
    """
    if n == 0:
        return ["#4a90d9"]

    def _hex_for(index: int) -> str:
        # Hue is the position on the wheel; saturation/value vary by index.
        channels = colorsys.hsv_to_rgb(
            index / n,
            0.7 + (index % 3) * 0.1,
            0.8 + (index % 2) * 0.1,
        )
        return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)

    return [_hex_for(index) for index in range(n)]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Shape, default color and size for each node type. Lookups use the node's
# "node_type" and fall back to the "chunk" entry for unknown types.
NODE_TYPE_CONFIG = {
    "document": {"shape": "diamond", "color": "#e94560", "size": 25},
    "chunk": {"shape": "dot", "color": "#4a90d9", "size": 12},
    "entity": {"shape": "triangle", "color": "#4ad94a", "size": 15},
}

# Hex colors keyed by edge-type name.
EDGE_TYPE_COLORS = {
    "contains": "#888888",
    "next": "#666666",
    "mentions": "#4a90d9",
    "relates_to": "#e94560",
    "co_occurs": "#4ad94a",
    "cross_doc": "#ff9800",  # Orange — highlight cross-document links
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class GraphVisualizer:
|
| 50 |
+
"""Create interactive HTML visualization for large knowledge graphs."""
|
| 51 |
+
|
| 52 |
+
def __init__(self, graph, communities: Dict = None):
    """Keep the graph and build a node-to-community lookup.

    Args:
        graph: Graph object to visualize (stored as-is).
        communities: Mapping of community id to either an object exposing
            ``node_ids`` or a dict with a ``'node_ids'`` key. Entries of any
            other shape are ignored. A node listed in several communities
            maps to the last one iterated.
    """
    self.graph = graph
    self.communities = communities or {}
    self.node_to_community = {}

    for community_key, community in self.communities.items():
        if hasattr(community, 'node_ids'):
            members = community.node_ids
        elif isinstance(community, dict) and 'node_ids' in community:
            members = community['node_ids']
        else:
            continue
        for member in members:
            self.node_to_community[member] = community_key
|
| 65 |
+
|
| 66 |
+
def create_visualization(
|
| 67 |
+
self,
|
| 68 |
+
output_path: Path,
|
| 69 |
+
max_nodes: int = 5000,
|
| 70 |
+
min_degree: int = 0,
|
| 71 |
+
show_labels: bool = True,
|
| 72 |
+
physics_enabled: bool = True,
|
| 73 |
+
):
|
| 74 |
+
"""
|
| 75 |
+
Create interactive HTML visualization.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
output_path: Where to save HTML file
|
| 79 |
+
max_nodes: Maximum nodes to display (sampled by importance)
|
| 80 |
+
min_degree: Minimum degree to include node
|
| 81 |
+
show_labels: Show node labels
|
| 82 |
+
physics_enabled: Enable physics simulation
|
| 83 |
+
"""
|
| 84 |
+
nodes_data, edges_data = self._prepare_data(max_nodes, min_degree)
|
| 85 |
+
|
| 86 |
+
if not nodes_data:
|
| 87 |
+
print("Warning: No nodes to visualize")
|
| 88 |
+
nodes_data = [{"id": "empty", "node_type": "chunk", "degree": 0}]
|
| 89 |
+
|
| 90 |
+
# Generate community colors
|
| 91 |
+
community_ids = list(set(self.node_to_community.values()))
|
| 92 |
+
colors = generate_colors(max(len(community_ids), 1))
|
| 93 |
+
community_colors = {cid: colors[i % len(colors)] for i, cid in enumerate(community_ids)}
|
| 94 |
+
|
| 95 |
+
# Count nodes by type
|
| 96 |
+
node_type_counts = {"document": 0, "chunk": 0, "entity": 0}
|
| 97 |
+
for node in nodes_data:
|
| 98 |
+
nt = node.get("node_type", "chunk")
|
| 99 |
+
node_type_counts[nt] = node_type_counts.get(nt, 0) + 1
|
| 100 |
+
|
| 101 |
+
# Build vis.js nodes
|
| 102 |
+
vis_nodes = []
|
| 103 |
+
for node in nodes_data:
|
| 104 |
+
node_id = str(node["id"])
|
| 105 |
+
node_type = node.get("node_type", "chunk")
|
| 106 |
+
community_id = self.node_to_community.get(node["id"])
|
| 107 |
+
|
| 108 |
+
config = NODE_TYPE_CONFIG.get(node_type, NODE_TYPE_CONFIG["chunk"])
|
| 109 |
+
|
| 110 |
+
# Determine color: community color or default by type
|
| 111 |
+
if community_id and community_id in community_colors:
|
| 112 |
+
node_color = community_colors[community_id]
|
| 113 |
+
else:
|
| 114 |
+
node_color = config["color"]
|
| 115 |
+
|
| 116 |
+
# Calculate size based on degree/pagerank
|
| 117 |
+
base_size = config["size"]
|
| 118 |
+
degree = node.get("degree", 0)
|
| 119 |
+
pagerank = node.get("pagerank", 0)
|
| 120 |
+
size = base_size + min(degree * 0.5, 15) + min(pagerank * 300, 10)
|
| 121 |
+
|
| 122 |
+
label = self._get_label(node) if show_labels else ""
|
| 123 |
+
tooltip = self._get_tooltip(node)
|
| 124 |
+
|
| 125 |
+
vis_nodes.append({
|
| 126 |
+
"id": node_id,
|
| 127 |
+
"label": label,
|
| 128 |
+
"title": tooltip,
|
| 129 |
+
"shape": config["shape"],
|
| 130 |
+
"color": node_color,
|
| 131 |
+
"size": int(size),
|
| 132 |
+
"font": {"color": "#ffffff", "size": 10},
|
| 133 |
+
"borderWidth": 1,
|
| 134 |
+
"borderWidthSelected": 3,
|
| 135 |
+
})
|
| 136 |
+
|
| 137 |
+
# Build vis.js edges
|
| 138 |
+
vis_edges = []
|
| 139 |
+
node_ids_set = {str(n["id"]) for n in nodes_data}
|
| 140 |
+
|
| 141 |
+
for edge in edges_data:
|
| 142 |
+
source = str(edge["source"])
|
| 143 |
+
target = str(edge["target"])
|
| 144 |
+
|
| 145 |
+
# Only include edges between existing nodes
|
| 146 |
+
if source not in node_ids_set or target not in node_ids_set:
|
| 147 |
+
continue
|
| 148 |
+
if source == target:
|
| 149 |
+
continue
|
| 150 |
+
|
| 151 |
+
edge_type = edge.get("edge_type", "relates_to")
|
| 152 |
+
edge_color = EDGE_TYPE_COLORS.get(edge_type, "#888888")
|
| 153 |
+
|
| 154 |
+
vis_edges.append({
|
| 155 |
+
"from": source,
|
| 156 |
+
"to": target,
|
| 157 |
+
"color": edge_color,
|
| 158 |
+
"width": max(1, min(edge.get("weight", 1) * 0.5, 5)),
|
| 159 |
+
"arrows": {"to": {"enabled": edge_type in ["contains", "mentions"], "scaleFactor": 0.5}},
|
| 160 |
+
"smooth": {"type": "continuous", "roundness": 0.2},
|
| 161 |
+
})
|
| 162 |
+
|
| 163 |
+
# Generate HTML
|
| 164 |
+
html = self._generate_html(vis_nodes, vis_edges, physics_enabled, community_colors, node_type_counts)
|
| 165 |
+
|
| 166 |
+
output_path = Path(output_path)
|
| 167 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 168 |
+
output_path.write_text(html, encoding="utf-8")
|
| 169 |
+
|
| 170 |
+
return {
|
| 171 |
+
"nodes": len(vis_nodes),
|
| 172 |
+
"edges": len(vis_edges),
|
| 173 |
+
"communities": len(community_colors),
|
| 174 |
+
"path": str(output_path),
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
def _prepare_data(self, max_nodes: int, min_degree: int) -> tuple:
|
| 178 |
+
"""Prepare and sample graph data."""
|
| 179 |
+
if self.graph is None or self.graph.number_of_nodes() == 0:
|
| 180 |
+
return [], []
|
| 181 |
+
|
| 182 |
+
# Get node degrees
|
| 183 |
+
degrees = dict(self.graph.degree())
|
| 184 |
+
|
| 185 |
+
# Filter by minimum degree
|
| 186 |
+
valid_nodes = {n for n, d in degrees.items() if d >= min_degree}
|
| 187 |
+
|
| 188 |
+
if not valid_nodes:
|
| 189 |
+
valid_nodes = set(self.graph.nodes())
|
| 190 |
+
|
| 191 |
+
# Sample by PageRank if too many nodes
|
| 192 |
+
if len(valid_nodes) > max_nodes:
|
| 193 |
+
pageranks = {}
|
| 194 |
+
for n in valid_nodes:
|
| 195 |
+
pr = self.graph.nodes[n].get("pagerank", 0) if n in self.graph.nodes else 0
|
| 196 |
+
pageranks[n] = pr + degrees.get(n, 0) * 0.01 # Combine with degree
|
| 197 |
+
|
| 198 |
+
sorted_nodes = sorted(pageranks.items(), key=lambda x: -x[1])
|
| 199 |
+
valid_nodes = {n for n, _ in sorted_nodes[:max_nodes]}
|
| 200 |
+
|
| 201 |
+
# Build nodes data
|
| 202 |
+
nodes_data = []
|
| 203 |
+
for nid in valid_nodes:
|
| 204 |
+
if nid not in self.graph.nodes:
|
| 205 |
+
continue
|
| 206 |
+
data = dict(self.graph.nodes[nid])
|
| 207 |
+
data["id"] = nid
|
| 208 |
+
data["degree"] = degrees.get(nid, 0)
|
| 209 |
+
nodes_data.append(data)
|
| 210 |
+
|
| 211 |
+
# Build edges data (only between valid nodes)
|
| 212 |
+
edges_data = []
|
| 213 |
+
for u, v, data in self.graph.edges(data=True):
|
| 214 |
+
if u in valid_nodes and v in valid_nodes:
|
| 215 |
+
edge = dict(data)
|
| 216 |
+
edge["source"] = u
|
| 217 |
+
edge["target"] = v
|
| 218 |
+
edges_data.append(edge)
|
| 219 |
+
|
| 220 |
+
return nodes_data, edges_data
|
| 221 |
+
|
| 222 |
+
def _get_label(self, node: Dict) -> str:
|
| 223 |
+
"""Get display label for node."""
|
| 224 |
+
node_type = node.get("node_type", "")
|
| 225 |
+
|
| 226 |
+
if node_type == "entity":
|
| 227 |
+
value = node.get("value", str(node["id"]))
|
| 228 |
+
return value[:25] if len(value) > 25 else value
|
| 229 |
+
elif node_type == "document":
|
| 230 |
+
title = node.get("title", str(node["id"]))
|
| 231 |
+
return title[:30] if len(title) > 30 else title
|
| 232 |
+
elif node_type == "chunk":
|
| 233 |
+
title = node.get("title", "")
|
| 234 |
+
if title:
|
| 235 |
+
return title[:20]
|
| 236 |
+
return f"Chunk"
|
| 237 |
+
|
| 238 |
+
return str(node["id"])[:20]
|
| 239 |
+
|
| 240 |
+
def _get_tooltip(self, node: Dict) -> str:
|
| 241 |
+
"""Get tooltip HTML for node."""
|
| 242 |
+
node_type = node.get("node_type", "unknown")
|
| 243 |
+
parts = [f"<b>{node_type.upper()}</b><br>"]
|
| 244 |
+
parts.append(f"ID: {node['id']}<br>")
|
| 245 |
+
|
| 246 |
+
if node.get("title"):
|
| 247 |
+
parts.append(f"Title: {node['title'][:100]}<br>")
|
| 248 |
+
if node.get("value"):
|
| 249 |
+
parts.append(f"Value: {node['value'][:100]}<br>")
|
| 250 |
+
if node.get("entity_type"):
|
| 251 |
+
parts.append(f"Type: {node['entity_type']}<br>")
|
| 252 |
+
if node.get("text"):
|
| 253 |
+
text = node['text'][:200].replace('<', '<').replace('>', '>')
|
| 254 |
+
parts.append(f"Text: {text}...<br>")
|
| 255 |
+
|
| 256 |
+
parts.append(f"Degree: {node.get('degree', 0)}<br>")
|
| 257 |
+
|
| 258 |
+
if node.get("pagerank"):
|
| 259 |
+
parts.append(f"PageRank: {node['pagerank']:.6f}<br>")
|
| 260 |
+
|
| 261 |
+
comm_id = self.node_to_community.get(node["id"])
|
| 262 |
+
if comm_id:
|
| 263 |
+
parts.append(f"Community: {comm_id}")
|
| 264 |
+
|
| 265 |
+
return "".join(parts)
|
| 266 |
+
|
| 267 |
+
def _generate_html(
|
| 268 |
+
self,
|
| 269 |
+
nodes: List[Dict],
|
| 270 |
+
edges: List[Dict],
|
| 271 |
+
physics_enabled: bool,
|
| 272 |
+
community_colors: Dict[str, str],
|
| 273 |
+
node_type_counts: Dict[str, int] = None,
|
| 274 |
+
) -> str:
|
| 275 |
+
"""Generate complete HTML visualization."""
|
| 276 |
+
|
| 277 |
+
if node_type_counts is None:
|
| 278 |
+
node_type_counts = {"document": 0, "chunk": 0, "entity": 0}
|
| 279 |
+
|
| 280 |
+
nodes_json = json.dumps(nodes, ensure_ascii=False)
|
| 281 |
+
edges_json = json.dumps(edges, ensure_ascii=False)
|
| 282 |
+
|
| 283 |
+
# Community legend
|
| 284 |
+
legend_items = []
|
| 285 |
+
for i, (comm_id, color) in enumerate(list(community_colors.items())[:15]):
|
| 286 |
+
comm = self.communities.get(comm_id)
|
| 287 |
+
if comm:
|
| 288 |
+
if hasattr(comm, 'summary'):
|
| 289 |
+
label = comm.summary[:35] if comm.summary else comm_id
|
| 290 |
+
elif isinstance(comm, dict):
|
| 291 |
+
label = comm.get('summary', comm_id)[:35]
|
| 292 |
+
else:
|
| 293 |
+
label = str(comm_id)[:35]
|
| 294 |
+
else:
|
| 295 |
+
label = str(comm_id)[:35]
|
| 296 |
+
legend_items.append(
|
| 297 |
+
f'<div class="legend-item">'
|
| 298 |
+
f'<span class="color-box" style="background:{color}"></span>'
|
| 299 |
+
f'<span>{label}</span></div>'
|
| 300 |
+
)
|
| 301 |
+
legend_html = "\n".join(legend_items) if legend_items else "<div>No communities</div>"
|
| 302 |
+
|
| 303 |
+
return f'''<!DOCTYPE html>
|
| 304 |
+
<html>
|
| 305 |
+
<head>
|
| 306 |
+
<meta charset="utf-8">
|
| 307 |
+
<title>GraphRAG Knowledge Graph</title>
|
| 308 |
+
<script src="https://unpkg.com/vis-network@9.1.6/standalone/umd/vis-network.min.js"></script>
|
| 309 |
+
<style>
|
| 310 |
+
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
|
| 311 |
+
body {{
|
| 312 |
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
| 313 |
+
background: #1a1a2e;
|
| 314 |
+
color: #eee;
|
| 315 |
+
}}
|
| 316 |
+
#container {{ display: flex; height: 100vh; }}
|
| 317 |
+
#graph {{
|
| 318 |
+
flex: 1;
|
| 319 |
+
background: #16213e;
|
| 320 |
+
border: 1px solid #0f3460;
|
| 321 |
+
}}
|
| 322 |
+
#sidebar {{
|
| 323 |
+
width: 280px;
|
| 324 |
+
background: #0f3460;
|
| 325 |
+
padding: 15px;
|
| 326 |
+
overflow-y: auto;
|
| 327 |
+
border-left: 1px solid #1a1a2e;
|
| 328 |
+
}}
|
| 329 |
+
h1 {{ font-size: 16px; margin-bottom: 15px; color: #e94560; }}
|
| 330 |
+
h2 {{ font-size: 13px; margin: 15px 0 8px; color: #e94560; }}
|
| 331 |
+
.stats {{
|
| 332 |
+
background: #1a1a2e;
|
| 333 |
+
padding: 10px;
|
| 334 |
+
border-radius: 5px;
|
| 335 |
+
margin-bottom: 15px;
|
| 336 |
+
}}
|
| 337 |
+
.stats div {{ margin: 4px 0; font-size: 12px; }}
|
| 338 |
+
.stats b {{ color: #4a90d9; }}
|
| 339 |
+
.controls {{ margin-bottom: 15px; }}
|
| 340 |
+
.controls input, .controls select, .controls button {{
|
| 341 |
+
width: 100%;
|
| 342 |
+
padding: 8px;
|
| 343 |
+
margin: 4px 0;
|
| 344 |
+
border: none;
|
| 345 |
+
border-radius: 4px;
|
| 346 |
+
background: #1a1a2e;
|
| 347 |
+
color: #eee;
|
| 348 |
+
font-size: 12px;
|
| 349 |
+
}}
|
| 350 |
+
.controls input:focus, .controls select:focus {{
|
| 351 |
+
outline: 1px solid #4a90d9;
|
| 352 |
+
}}
|
| 353 |
+
.controls button {{
|
| 354 |
+
background: #e94560;
|
| 355 |
+
cursor: pointer;
|
| 356 |
+
font-weight: bold;
|
| 357 |
+
}}
|
| 358 |
+
.controls button:hover {{ background: #c73e54; }}
|
| 359 |
+
.legend {{
|
| 360 |
+
max-height: 250px;
|
| 361 |
+
overflow-y: auto;
|
| 362 |
+
background: #1a1a2e;
|
| 363 |
+
padding: 8px;
|
| 364 |
+
border-radius: 5px;
|
| 365 |
+
}}
|
| 366 |
+
.legend-item {{
|
| 367 |
+
display: flex;
|
| 368 |
+
align-items: center;
|
| 369 |
+
padding: 3px 0;
|
| 370 |
+
font-size: 11px;
|
| 371 |
+
}}
|
| 372 |
+
.color-box {{
|
| 373 |
+
width: 12px;
|
| 374 |
+
height: 12px;
|
| 375 |
+
margin-right: 8px;
|
| 376 |
+
border-radius: 2px;
|
| 377 |
+
flex-shrink: 0;
|
| 378 |
+
}}
|
| 379 |
+
#node-info {{
|
| 380 |
+
background: #1a1a2e;
|
| 381 |
+
padding: 10px;
|
| 382 |
+
border-radius: 5px;
|
| 383 |
+
font-size: 11px;
|
| 384 |
+
max-height: 180px;
|
| 385 |
+
overflow-y: auto;
|
| 386 |
+
line-height: 1.4;
|
| 387 |
+
}}
|
| 388 |
+
#node-info b {{ color: #e94560; }}
|
| 389 |
+
#loading {{
|
| 390 |
+
position: absolute;
|
| 391 |
+
top: 50%;
|
| 392 |
+
left: 50%;
|
| 393 |
+
transform: translate(-50%, -50%);
|
| 394 |
+
background: #0f3460;
|
| 395 |
+
padding: 20px 40px;
|
| 396 |
+
border-radius: 8px;
|
| 397 |
+
z-index: 1000;
|
| 398 |
+
}}
|
| 399 |
+
.type-legend {{
|
| 400 |
+
display: flex;
|
| 401 |
+
gap: 10px;
|
| 402 |
+
margin-bottom: 10px;
|
| 403 |
+
font-size: 11px;
|
| 404 |
+
}}
|
| 405 |
+
.type-item {{
|
| 406 |
+
display: flex;
|
| 407 |
+
align-items: center;
|
| 408 |
+
gap: 4px;
|
| 409 |
+
}}
|
| 410 |
+
.type-icon {{
|
| 411 |
+
width: 10px;
|
| 412 |
+
height: 10px;
|
| 413 |
+
border-radius: 50%;
|
| 414 |
+
}}
|
| 415 |
+
</style>
|
| 416 |
+
</head>
|
| 417 |
+
<body>
|
| 418 |
+
<div id="container">
|
| 419 |
+
<div id="graph">
|
| 420 |
+
<div id="loading">⏳ Loading graph...</div>
|
| 421 |
+
</div>
|
| 422 |
+
<div id="sidebar">
|
| 423 |
+
<h1>🔗 GraphRAG Visualization</h1>
|
| 424 |
+
|
| 425 |
+
<div class="stats">
|
| 426 |
+
<div>📊 Total Nodes: <b>{len(nodes)}</b></div>
|
| 427 |
+
<div style="padding-left:12px">📄 Documents: <b>{node_type_counts.get('document', 0)}</b></div>
|
| 428 |
+
<div style="padding-left:12px">📝 Chunks: <b>{node_type_counts.get('chunk', 0)}</b></div>
|
| 429 |
+
<div style="padding-left:12px">🏷️ Entities: <b>{node_type_counts.get('entity', 0)}</b></div>
|
| 430 |
+
<div>🔗 Edges: <b>{len(edges)}</b></div>
|
| 431 |
+
<div>🏘️ Communities: <b>{len(community_colors)}</b></div>
|
| 432 |
+
</div>
|
| 433 |
+
|
| 434 |
+
<div class="type-legend">
|
| 435 |
+
<div class="type-item"><div class="type-icon" style="background:#e94560"></div>Document</div>
|
| 436 |
+
<div class="type-item"><div class="type-icon" style="background:#4a90d9"></div>Chunk</div>
|
| 437 |
+
<div class="type-item"><div class="type-icon" style="background:#4ad94a"></div>Entity</div>
|
| 438 |
+
</div>
|
| 439 |
+
|
| 440 |
+
<div class="controls">
|
| 441 |
+
<input type="text" id="search" placeholder="🔍 Search nodes...">
|
| 442 |
+
<select id="filter-type">
|
| 443 |
+
<option value="">All node types</option>
|
| 444 |
+
<option value="document">📄 Documents</option>
|
| 445 |
+
<option value="chunk">📝 Chunks</option>
|
| 446 |
+
<option value="entity">🏷️ Entities</option>
|
| 447 |
+
</select>
|
| 448 |
+
<button onclick="resetView()">🔄 Reset View</button>
|
| 449 |
+
<button onclick="togglePhysics()">⚡ Toggle Physics</button>
|
| 450 |
+
<button onclick="fitGraph()">🎯 Fit to Screen</button>
|
| 451 |
+
</div>
|
| 452 |
+
|
| 453 |
+
<h2>Communities</h2>
|
| 454 |
+
<div class="legend">{legend_html}</div>
|
| 455 |
+
|
| 456 |
+
<h2>Selected Node</h2>
|
| 457 |
+
<div id="node-info">Click a node to see details</div>
|
| 458 |
+
</div>
|
| 459 |
+
</div>
|
| 460 |
+
|
| 461 |
+
<script>
|
| 462 |
+
// Parse data
|
| 463 |
+
const nodesArray = {nodes_json};
|
| 464 |
+
const edgesArray = {edges_json};
|
| 465 |
+
|
| 466 |
+
console.log('Nodes:', nodesArray.length);
|
| 467 |
+
console.log('Edges:', edgesArray.length);
|
| 468 |
+
|
| 469 |
+
// Create DataSets
|
| 470 |
+
const nodes = new vis.DataSet(nodesArray);
|
| 471 |
+
const edges = new vis.DataSet(edgesArray);
|
| 472 |
+
|
| 473 |
+
// Get container
|
| 474 |
+
const container = document.getElementById('graph');
|
| 475 |
+
|
| 476 |
+
// Network data
|
| 477 |
+
const data = {{ nodes: nodes, edges: edges }};
|
| 478 |
+
|
| 479 |
+
// Options
|
| 480 |
+
const options = {{
|
| 481 |
+
nodes: {{
|
| 482 |
+
font: {{
|
| 483 |
+
color: '#ffffff',
|
| 484 |
+
size: 10,
|
| 485 |
+
face: 'arial'
|
| 486 |
+
}},
|
| 487 |
+
borderWidth: 1,
|
| 488 |
+
borderWidthSelected: 3,
|
| 489 |
+
chosen: true,
|
| 490 |
+
}},
|
| 491 |
+
edges: {{
|
| 492 |
+
smooth: {{
|
| 493 |
+
enabled: true,
|
| 494 |
+
type: 'continuous',
|
| 495 |
+
roundness: 0.2
|
| 496 |
+
}},
|
| 497 |
+
selectionWidth: 2,
|
| 498 |
+
hoverWidth: 1.5,
|
| 499 |
+
}},
|
| 500 |
+
physics: {{
|
| 501 |
+
enabled: {'true' if physics_enabled else 'false'},
|
| 502 |
+
solver: 'barnesHut',
|
| 503 |
+
barnesHut: {{
|
| 504 |
+
gravitationalConstant: -3000,
|
| 505 |
+
centralGravity: 0.3,
|
| 506 |
+
springLength: 120,
|
| 507 |
+
springConstant: 0.04,
|
| 508 |
+
damping: 0.09,
|
| 509 |
+
avoidOverlap: 0.1
|
| 510 |
+
}},
|
| 511 |
+
stabilization: {{
|
| 512 |
+
enabled: true,
|
| 513 |
+
iterations: 150,
|
| 514 |
+
updateInterval: 25,
|
| 515 |
+
fit: true
|
| 516 |
+
}},
|
| 517 |
+
maxVelocity: 50,
|
| 518 |
+
minVelocity: 0.1,
|
| 519 |
+
}},
|
| 520 |
+
interaction: {{
|
| 521 |
+
hover: true,
|
| 522 |
+
tooltipDelay: 200,
|
| 523 |
+
hideEdgesOnDrag: true,
|
| 524 |
+
hideEdgesOnZoom: true,
|
| 525 |
+
multiselect: true,
|
| 526 |
+
navigationButtons: true,
|
| 527 |
+
keyboard: true,
|
| 528 |
+
}},
|
| 529 |
+
layout: {{
|
| 530 |
+
improvedLayout: true,
|
| 531 |
+
randomSeed: 42,
|
| 532 |
+
}},
|
| 533 |
+
}};
|
| 534 |
+
|
| 535 |
+
// Create network
|
| 536 |
+
const network = new vis.Network(container, data, options);
|
| 537 |
+
|
| 538 |
+
// Hide loading helper
|
| 539 |
+
function hideLoading() {{
|
| 540 |
+
const el = document.getElementById('loading');
|
| 541 |
+
if (el) el.style.display = 'none';
|
| 542 |
+
}}
|
| 543 |
+
|
| 544 |
+
function updateLoading(text) {{
|
| 545 |
+
const el = document.getElementById('loading');
|
| 546 |
+
if (el) el.textContent = text;
|
| 547 |
+
}}
|
| 548 |
+
|
| 549 |
+
// Hide loading on stabilization done
|
| 550 |
+
network.on('stabilizationIterationsDone', function() {{
|
| 551 |
+
hideLoading();
|
| 552 |
+
console.log('Stabilization complete');
|
| 553 |
+
}});
|
| 554 |
+
|
| 555 |
+
network.on('stabilizationProgress', function(params) {{
|
| 556 |
+
const progress = Math.round(params.iterations / params.total * 100);
|
| 557 |
+
updateLoading('⏳ Stabilizing: ' + progress + '%');
|
| 558 |
+
}});
|
| 559 |
+
|
| 560 |
+
// If no physics, hide loading immediately
|
| 561 |
+
if (!{'true' if physics_enabled else 'false'}) {{
|
| 562 |
+
setTimeout(hideLoading, 500);
|
| 563 |
+
}}
|
| 564 |
+
|
| 565 |
+
// Store original node data for filtering
|
| 566 |
+
const originalNodes = nodesArray.slice();
|
| 567 |
+
|
| 568 |
+
// Search functionality
|
| 569 |
+
document.getElementById('search').addEventListener('input', function(e) {{
|
| 570 |
+
const query = e.target.value.toLowerCase().trim();
|
| 571 |
+
|
| 572 |
+
if (!query) {{
|
| 573 |
+
// Show all nodes
|
| 574 |
+
originalNodes.forEach(n => {{
|
| 575 |
+
nodes.update({{id: n.id, hidden: false}});
|
| 576 |
+
}});
|
| 577 |
+
return;
|
| 578 |
+
}}
|
| 579 |
+
|
| 580 |
+
originalNodes.forEach(n => {{
|
| 581 |
+
const label = (n.label || '').toLowerCase();
|
| 582 |
+
const title = (n.title || '').toLowerCase();
|
| 583 |
+
const id = (n.id || '').toLowerCase();
|
| 584 |
+
const match = label.includes(query) || title.includes(query) || id.includes(query);
|
| 585 |
+
nodes.update({{id: n.id, hidden: !match}});
|
| 586 |
+
}});
|
| 587 |
+
}});
|
| 588 |
+
|
| 589 |
+
// Type filter
|
| 590 |
+
document.getElementById('filter-type').addEventListener('change', function(e) {{
|
| 591 |
+
const type = e.target.value;
|
| 592 |
+
|
| 593 |
+
originalNodes.forEach(n => {{
|
| 594 |
+
// Get node type from original data
|
| 595 |
+
const nodeType = nodesArray.find(x => x.id === n.id);
|
| 596 |
+
const nType = nodeType ? (nodeType.title || '').split('<br>')[0].replace('<b>', '').replace('</b>', '').toLowerCase() : '';
|
| 597 |
+
|
| 598 |
+
if (!type) {{
|
| 599 |
+
nodes.update({{id: n.id, hidden: false}});
|
| 600 |
+
}} else {{
|
| 601 |
+
// Check shape to determine type
|
| 602 |
+
const shape = nodeType ? nodeType.shape : '';
|
| 603 |
+
let isType = false;
|
| 604 |
+
if (type === 'document' && shape === 'diamond') isType = true;
|
| 605 |
+
if (type === 'chunk' && shape === 'dot') isType = true;
|
| 606 |
+
if (type === 'entity' && shape === 'triangle') isType = true;
|
| 607 |
+
nodes.update({{id: n.id, hidden: !isType}});
|
| 608 |
+
}}
|
| 609 |
+
}});
|
| 610 |
+
}});
|
| 611 |
+
|
| 612 |
+
// Node click handler
|
| 613 |
+
network.on('click', function(params) {{
|
| 614 |
+
if (params.nodes.length > 0) {{
|
| 615 |
+
const nodeId = params.nodes[0];
|
| 616 |
+
const node = nodes.get(nodeId);
|
| 617 |
+
if (node) {{
|
| 618 |
+
document.getElementById('node-info').innerHTML = node.title || 'No details available';
|
| 619 |
+
}}
|
| 620 |
+
}}
|
| 621 |
+
}});
|
| 622 |
+
|
| 623 |
+
// Double click to focus
|
| 624 |
+
network.on('doubleClick', function(params) {{
|
| 625 |
+
if (params.nodes.length > 0) {{
|
| 626 |
+
network.focus(params.nodes[0], {{
|
| 627 |
+
scale: 1.5,
|
| 628 |
+
animation: true
|
| 629 |
+
}});
|
| 630 |
+
}}
|
| 631 |
+
}});
|
| 632 |
+
|
| 633 |
+
function resetView() {{
|
| 634 |
+
// Show all nodes
|
| 635 |
+
originalNodes.forEach(n => {{
|
| 636 |
+
nodes.update({{id: n.id, hidden: false}});
|
| 637 |
+
}});
|
| 638 |
+
document.getElementById('search').value = '';
|
| 639 |
+
document.getElementById('filter-type').value = '';
|
| 640 |
+
network.fit({{animation: true}});
|
| 641 |
+
}}
|
| 642 |
+
|
| 643 |
+
let physicsEnabled = {'true' if physics_enabled else 'false'};
|
| 644 |
+
function togglePhysics() {{
|
| 645 |
+
physicsEnabled = !physicsEnabled;
|
| 646 |
+
network.setOptions({{ physics: {{ enabled: physicsEnabled }} }});
|
| 647 |
+
}}
|
| 648 |
+
|
| 649 |
+
function fitGraph() {{
|
| 650 |
+
network.fit({{animation: true}});
|
| 651 |
+
}}
|
| 652 |
+
|
| 653 |
+
// Initial fit after short delay
|
| 654 |
+
setTimeout(() => {{
|
| 655 |
+
network.fit();
|
| 656 |
+
}}, 100);
|
| 657 |
+
</script>
|
| 658 |
+
</body>
|
| 659 |
+
</html>'''
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
def visualize_graph(
    graph,
    communities: Dict = None,
    output_path: str = "graph_visualization.html",
    max_nodes: int = 5000,
    min_degree: int = 0,
) -> Dict:
    """
    One-call helper: build a GraphVisualizer and write the HTML file.

    Args:
        graph: NetworkX graph
        communities: Community dict
        output_path: Output HTML file path
        max_nodes: Maximum nodes to display
        min_degree: Minimum degree filter

    Returns:
        Dict with visualization stats
    """
    visualizer = GraphVisualizer(graph, communities)
    destination = Path(output_path)
    stats = visualizer.create_visualization(
        destination,
        max_nodes=max_nodes,
        min_degree=min_degree,
    )
    return stats
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
# Test function
|
| 691 |
+
if __name__ == "__main__":
    import networkx as nx

    # Small synthetic graph: 10 documents, 25 chunks, 15 entities
    demo_graph = nx.DiGraph()

    for doc_idx in range(10):
        demo_graph.add_node(
            f"doc_{doc_idx}",
            node_type="document",
            title=f"Document {doc_idx}",
            pagerank=0.05,
        )

    for chunk_idx in range(25):
        demo_graph.add_node(
            f"chunk_{chunk_idx}",
            node_type="chunk",
            title=f"Chunk {chunk_idx}",
            text=f"Sample text for chunk {chunk_idx}",
        )

    for entity_idx in range(15):
        demo_graph.add_node(
            f"entity_{entity_idx}",
            node_type="entity",
            value=f"Entity {entity_idx}",
            entity_type="PERSON",
        )

    # Each chunk belongs to one of the 10 documents (round-robin)
    for chunk_idx in range(25):
        demo_graph.add_edge(
            f"doc_{chunk_idx % 10}", f"chunk_{chunk_idx}", edge_type="contains", weight=1.0
        )

    # The first 15 chunks each mention the entity with the same index
    for entity_idx in range(15):
        demo_graph.add_edge(
            f"chunk_{entity_idx}", f"entity_{entity_idx}", edge_type="mentions", weight=0.8
        )

    # Chain the first entities together with co-occurrence edges
    for entity_idx in range(10):
        demo_graph.add_edge(
            f"entity_{entity_idx}", f"entity_{entity_idx + 1}", edge_type="co_occurs", weight=0.5
        )

    # Visualize
    summary = visualize_graph(demo_graph, output_path="test_graph.html")
    print(f"Created visualization: {summary}")
|
prebuild.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""GraphRAG v4 — Pre-build graph and index for deployment.
|
| 3 |
+
|
| 4 |
+
This script runs the full pipeline ONCE on a machine with GPU/RAM,
|
| 5 |
+
producing a self-contained artifact directory that the Gradio demo
|
| 6 |
+
loads instantly at startup — no model loading, no PDF parsing,
|
| 7 |
+
no PageRank, no Leiden needed at inference time.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
# Build from PDFs:
|
| 11 |
+
python prebuild.py --input ./pdfs --output ./data/prebuilt
|
| 12 |
+
|
| 13 |
+
# Then deploy: copy ./data/prebuilt/ into the Docker image / HF Space.
|
| 14 |
+
# The demo will detect it and skip all heavy initialization.
|
| 15 |
+
|
| 16 |
+
Output structure:
|
| 17 |
+
data/prebuilt/
|
| 18 |
+
├── corpus/ # Raw corpus (documents, chunks, entities, relations)
|
| 19 |
+
│ ├── documents.jsonl
|
| 20 |
+
│ ├── chunks.jsonl
|
| 21 |
+
│ ├── entities.jsonl
|
| 22 |
+
│ ├── relations.jsonl
|
| 23 |
+
│ ├── chunk_entities.json
|
| 24 |
+
│ └── stats.json
|
| 25 |
+
├── graph/ # Serialized NetworkX graph + communities
|
| 26 |
+
│ ├── graph_nodes.jsonl
|
| 27 |
+
│ ├── graph_edges.jsonl
|
| 28 |
+
│ └── communities.json
|
| 29 |
+
└── index/ # FAISS + sparse index + community embeddings
|
| 30 |
+
├── dense_index.faiss
|
| 31 |
+
├── dense_index.meta.pkl
|
| 32 |
+
├── sparse_index.pkl
|
| 33 |
+
└── community_embeddings.pkl
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
import argparse
|
| 37 |
+
import json
|
| 38 |
+
import sys
|
| 39 |
+
import time
|
| 40 |
+
from pathlib import Path
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def prebuild(input_path: Path, output_dir: Path, max_chunk_tokens: int = 384):
    """Run the full pipeline once and write deployable artifacts.

    Three steps are executed in order, each writing into its own
    sub-directory of ``output_dir``:

    1. ``corpus/`` — built from ``input_path`` via ``build_corpus``.
    2. ``graph/``  — knowledge graph with co-occurrence and cross-document
       edges, PageRank, communities and community summaries.
    3. ``index/``  — retriever index over chunks and communities.

    Args:
        input_path: Input PDF file or directory.
        output_dir: Root directory for the pre-built artifacts.
        max_chunk_tokens: Max chunk size in tokens, passed to ``build_corpus``.

    Exits the process with status 1 if ``corpus/stats.json`` was not
    produced by the corpus step.
    """
    start = time.time()

    corpus_dir = output_dir / "corpus"
    graph_dir = output_dir / "graph"
    index_dir = output_dir / "index"

    for d in [corpus_dir, graph_dir, index_dir]:
        d.mkdir(parents=True, exist_ok=True)

    # ── Step 1: Build corpus ─────────────────────────────────────────
    print("=" * 60)
    print("STEP 1: Building corpus from PDFs")
    print("=" * 60)

    # Project imports are deferred so that heavy modules load only when
    # the corresponding step actually runs.
    from graphrag_v4.corpus_builder import build_corpus
    build_corpus(input_path=input_path, output_dir=corpus_dir, max_chunk_tokens=max_chunk_tokens)

    stats_file = corpus_dir / "stats.json"
    if not stats_file.exists():
        print("ERROR: Corpus building failed.")
        sys.exit(1)

    # Explicit encoding, consistent with how chunks.jsonl is read below,
    # so the result does not depend on the platform default.
    stats = json.loads(stats_file.read_text(encoding="utf-8"))
    print(f"  Documents: {stats['document_count']}")
    print(f"  Chunks: {stats['total_chunks']}")
    print(f"  Entities: {stats['total_entities']}")
    print(f"  Relations: {stats['total_relations']}")

    # ── Step 2: Build knowledge graph ────────────────────────────────
    print(f"\n{'=' * 60}")
    print("STEP 2: Building knowledge graph")
    print("=" * 60)

    from graphrag_v4.graph_builder import KnowledgeGraphBuilder
    builder = KnowledgeGraphBuilder()
    builder.load_corpus(corpus_dir)
    print(f"  Loaded: {builder.graph.number_of_nodes()} nodes, {builder.graph.number_of_edges()} edges")

    builder.build_cooccurrence_edges()
    print(f"  After co-occurrence: {builder.graph.number_of_edges()} edges")

    cross_doc = builder.build_cross_document_edges()
    print(f"  Cross-doc edges: {cross_doc}")

    builder.compute_pagerank()
    builder.detect_communities(n_levels=2)
    print(f"  Communities: {len(builder.communities)}")

    builder.generate_community_summaries()

    # Save graph
    builder.save(graph_dir)
    print(f"  Saved graph to {graph_dir}")

    # ── Step 3: Build search index ───────────────────────────────────
    print(f"\n{'=' * 60}")
    print("STEP 3: Building search index")
    print("=" * 60)

    from graphrag_v4.retriever import HybridRetriever
    retriever = HybridRetriever(graph=builder.graph, communities=builder.communities)

    chunks_file = corpus_dir / "chunks.jsonl"
    with open(chunks_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on them.
        chunks = [json.loads(line) for line in f if line.strip()]

    print(f"  Indexing {len(chunks)} chunks...")
    retriever.index_chunks(chunks)
    retriever.index_communities(builder.communities)

    retriever.save(index_dir)
    print(f"  Saved index to {index_dir}")

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - start
    kg_stats = builder.get_stats()

    print(f"\n{'=' * 60}")
    print(f"PRE-BUILD COMPLETE in {elapsed:.1f}s")
    print(f"{'=' * 60}")
    print(f"  Output: {output_dir}")
    print(f"  Nodes: {kg_stats['total_nodes']}, Edges: {kg_stats['total_edges']}")
    print(f"  Communities: {kg_stats['communities']}")
    print(f"  Cross-doc entities: {kg_stats.get('cross_doc_entities', 0)}")
    print()
    print(f"  To deploy, copy {output_dir}/ into your Docker image or HF Space.")
    print(f"  The demo will auto-detect it and load instantly.")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def main():
    """Command-line entry point: parse arguments, validate input, run prebuild.

    Prints an error and exits with status 1 when the ``--input`` path does
    not exist.
    """
    cli = argparse.ArgumentParser(
        description="Pre-build GraphRAG graph + index for deployment",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python prebuild.py --input ./pdfs --output ./data/prebuilt
python prebuild.py --input ./single_doc.pdf --output ./data/prebuilt --max-chunk-tokens 512
""",
    )
    cli.add_argument("--input", "-i", required=True, help="Input PDF file or directory")
    cli.add_argument("--output", "-o", required=True, help="Output directory for pre-built artifacts")
    cli.add_argument("--max-chunk-tokens", type=int, default=384, help="Max chunk size in tokens")
    options = cli.parse_args()

    source = Path(options.input)
    # Fail fast before any heavy work starts
    if not source.exists():
        print(f"ERROR: Input not found: {source}")
        sys.exit(1)

    prebuild(
        input_path=source,
        output_dir=Path(options.output),
        max_chunk_tokens=options.max_chunk_tokens,
    )


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
|
| 5 |
+
# Embeddings
|
| 6 |
+
FlagEmbedding>=1.2.10
|
| 7 |
+
|
| 8 |
+
# Entity extraction
|
| 9 |
+
gliner>=0.2.0
|
| 10 |
+
|
| 11 |
+
# Graph
|
| 12 |
+
networkx>=3.1
|
| 13 |
+
python-igraph>=0.11.0
|
| 14 |
+
leidenalg>=0.10.0
|
| 15 |
+
|
| 16 |
+
# Vector search
|
| 17 |
+
faiss-cpu>=1.7.4
|
| 18 |
+
|
| 19 |
+
# PDF processing
|
| 20 |
+
PyMuPDF>=1.23.0
|
| 21 |
+
|
| 22 |
+
# UI
|
| 23 |
+
gradio>=4.0.0
|
| 24 |
+
|
| 25 |
+
# Utilities
|
| 26 |
+
tqdm>=4.65.0
|
| 27 |
+
python-dotenv>=1.0.0
|
| 28 |
+
|
| 29 |
+
# Optional: LLM integration
|
| 30 |
+
openai>=1.0.0
|