Spaces:
Build error
Build error
File size: 6,446 Bytes
bed7d75 f9d2704 c929b50 beaea9f bed7d75 beaea9f bed7d75 beaea9f bed7d75 beaea9f bed7d75 b1a0c4c c84d527 22a8aa1 b1a0c4c bb31fc3 4a32b09 beaea9f 1fead9a beaea9f 1fead9a beaea9f 4a32b09 bb31fc3 1fead9a bb31fc3 1fead9a bb31fc3 4a32b09 beaea9f 1fead9a bed7d75 beaea9f bed7d75 1fead9a bed7d75 beaea9f bed7d75 beaea9f bed7d75 beaea9f bed7d75 beaea9f bed7d75 f9d2704 bed7d75 f9d2704 beaea9f bed7d75 beaea9f 6b06a97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import kuzu
import logging
import sys
import os
import rdflib
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, XSD, Namespace
#import llama_index
from llama_index.graph_stores import KuzuGraphStore
from llama_index import (
SimpleDirectoryReader,
ServiceContext,
KnowledgeGraphIndex,
)
from llama_index.readers import SimpleWebPageReader
from llama_index.indices.loading import load_index_from_storage
from llama_index.llms import OpenAI
from IPython.display import Markdown, display
from llama_index.storage.storage_context import StorageContext
from pyvis.network import Network
import pandas as pd
import numpy as np
import plotly.express as px
import umap
def make_dir():
    """Ensure the ``data`` upload directory exists in the working directory.

    Calling this when the directory is already present is a no-op.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first: two concurrent callers can no longer trip over each other.
    os.makedirs("data", exist_ok=True)
def save_uploadedfile(uploadedfile):
    """Write an uploaded file's bytes into the ``data`` directory.

    Args:
        uploadedfile: Object exposing a ``name`` attribute (the client-supplied
            file name) and a ``getbuffer()`` method returning the file bytes.

    Raises:
        ValueError: If ``uploadedfile.name`` does not contain a usable file
            name (empty, ``.`` or ``..``).
    """
    # The name comes from the client, so keep only its final component;
    # otherwise something like "../x" would be written outside of "data".
    safe_name = os.path.basename(uploadedfile.name)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid upload file name: {uploadedfile.name!r}")

    # Do not rely on the caller having created the directory beforehand.
    os.makedirs("data", exist_ok=True)
    with open(os.path.join("data", safe_name), "wb") as f:
        f.write(uploadedfile.getbuffer())
def load_index(token, name, base_url):
    """Reload a previously persisted knowledge-graph index from disk.

    Exports the OpenAI credentials to the process environment, opens the Kuzu
    database at ``<name>/kg`` and loads the index from ``<name>/storage``.

    Args:
        token: OpenAI API key; also exported as ``OPENAI_API_KEY``.
        name: Directory holding the ``kg`` database and ``storage`` folder.
        base_url: OpenAI-compatible endpoint; also exported as
            ``OPENAI_API_BASE``.

    Returns:
        The index object produced by ``load_index_from_storage``.
    """
    os.environ.update({"OPENAI_API_KEY": token, "OPENAI_API_BASE": base_url})
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    kg_path = name + "/kg"
    persist_path = name + "/storage"

    kuzu_store = KuzuGraphStore(kuzu.Database(kg_path))
    chat_llm = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    svc_ctx = ServiceContext.from_defaults(llm=chat_llm, chunk_size=512)
    store_ctx = StorageContext.from_defaults(
        graph_store=kuzu_store,
        persist_dir=persist_path,
    )
    return load_index_from_storage(
        storage_context=store_ctx,
        service_context=svc_ctx,
    )
def get_index_pdf(token, name, base_url):
    """Build and persist a knowledge-graph index from the files in ``./data``.

    The documents are read first, then the directory ``name`` is created to
    hold the Kuzu database (``<name>/kg``) and the persisted index storage
    (``<name>/storage``).

    Args:
        token: OpenAI API key; also exported as ``OPENAI_API_KEY``.
        name: Directory to create for the database and the persisted index.
        base_url: OpenAI-compatible endpoint; also exported as
            ``OPENAI_API_BASE``.

    Returns:
        The ``KnowledgeGraphIndex`` built from the documents.

    Raises:
        FileExistsError: If the directory ``name`` already exists
            (raised by ``os.mkdir``).
    """
    documents = SimpleDirectoryReader("./data").load_data()
    # Echo what was ingested (once) as a quick sanity check.
    print(documents)

    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index
def get_index(links, token, name, base_url):
    """Build and persist a knowledge-graph index from a list of web pages.

    The pages are fetched before any on-disk state is created, so a failed
    download does not leave behind a directory that would make the next
    attempt with the same ``name`` fail in ``os.mkdir``.

    Args:
        links: URLs handed to ``SimpleWebPageReader.load_data``.
        token: OpenAI API key; also exported as ``OPENAI_API_KEY``.
        name: Directory to create for the Kuzu database (``<name>/kg``) and
            the persisted index storage (``<name>/storage``).
        base_url: OpenAI-compatible endpoint; also exported as
            ``OPENAI_API_BASE``.

    Returns:
        The ``KnowledgeGraphIndex`` built from the fetched pages.

    Raises:
        FileExistsError: If the directory ``name`` already exists
            (raised by ``os.mkdir``).
    """
    # Fetch first: this is the step most likely to fail, and it needs nothing
    # from the directory or database created below.
    documents = SimpleWebPageReader(html_to_text=True).load_data(links)

    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index
def get_network_graph(index, output_path="kuzugraph_draw3.html"):
    """Render the index's graph as an interactive pyvis HTML file.

    Args:
        index: Object exposing ``get_networkx_graph()``.
        output_path: Where the HTML file is written. Defaults to
            ``kuzugraph_draw3.html``, the name used before this parameter
            existed.
    """
    nx_graph = index.get_networkx_graph()
    net = Network(directed=True)
    net.from_nx(nx_graph)
    net.save_graph(output_path)
def get_embeddings(index):
    """Extract the ``embedding_dict`` column of the index structure.

    The index structure is dumped to a dict, loaded into a DataFrame, and the
    ``embedding_dict`` column is returned with missing entries dropped.

    Args:
        index: Object whose ``index_struct.to_dict()`` result contains an
            ``embedding_dict`` key.

    Returns:
        A pandas Series named ``embedding_dict`` without NaN rows.
    """
    struct_as_dict = index.index_struct.to_dict()
    struct_frame = pd.DataFrame.from_dict(struct_as_dict)
    return struct_frame['embedding_dict'].dropna()
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Reduce embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are embedding vectors;
            its index is shown as the hover label of each point.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions. ``2`` yields a 2-D
            scatter plot, ``3`` a 3-D scatter plot.

    Returns:
        The Plotly figure (``px.scatter`` or ``px.scatter_3d``).

    Raises:
        ValueError: If ``n_components`` is not 2 or 3, or if
            ``embedding_series`` is empty.
    """
    # Validate before the (expensive) UMAP fit instead of failing afterwards
    # on a column-count mismatch.
    if n_components not in (2, 3):
        raise ValueError(f"n_components must be 2 or 3 to be plotted, got {n_components!r}")
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")

    # The Series is label-indexed, so the first vector has to be requested
    # by position explicitly; plain [0] would be treated as a label lookup.
    vector_len = len(embedding_series.iloc[0])

    # Convert Series to DataFrame
    embedding_df = pd.DataFrame(
        embedding_series.tolist(),
        columns=[f'dim_{i+1}' for i in range(vector_len)],
    )

    # Perform UMAP dimensionality reduction
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)

    # One column per requested component rather than a fixed pair.
    axis_names = [f'UMAP Dimension {i + 1}' for i in range(n_components)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_series.index

    # Plot the UMAP embedding using Plotly Express
    if n_components == 3:
        return px.scatter_3d(
            umap_df,
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            hover_data=['Label'],
            title='UMAP Visualization of Embeddings',
        )
    return px.scatter(
        umap_df,
        x=axis_names[0],
        y=axis_names[1],
        hover_data=['Label'],
        title='UMAP Visualization of Embeddings',
    )
def generate_rdf(index):
    """Translate the index's nodes into an rdflib graph.

    Every node becomes a resource under ``http://example.com/`` typed as
    ``Node`` with a ``text`` literal, plus one triple per relationship.

    NOTE(review): assumes ``index.index_struct.node_dict`` holds objects with
    ``node_id``, ``text`` and an iterable ``relationships`` whose items carry
    ``predicate`` and ``object_id`` -- TODO confirm against the llama_index
    version in use.

    Args:
        index: Index whose ``index_struct.node_dict`` values are converted.

    Returns:
        The populated ``rdflib.Graph``.
    """
    ns = Namespace("http://example.com/")
    rdf_graph = Graph()

    # These two terms are the same for every node, so build them once.
    node_class = ns["Node"]
    text_property = ns["text"]

    for entry in index.index_struct.node_dict.values():
        node_ref = ns[str(entry.node_id)]

        # Node properties
        rdf_graph.add((node_ref, RDF.type, node_class))
        rdf_graph.add((node_ref, text_property, Literal(entry.text)))

        # Node relationships
        for link in entry.relationships:
            rdf_graph.add((node_ref, ns[link.predicate], ns[str(link.object_id)]))

    return rdf_graph
def visualize_rdf(rdf_graph):
    """Return the Turtle serialization of *rdf_graph* as text.

    Args:
        rdf_graph: Object exposing ``serialize(format=...)``, e.g. an
            ``rdflib.Graph``.

    Returns:
        The Turtle document as a ``str``.
    """
    serialized = rdf_graph.serialize(format="turtle")
    # rdflib < 6 returns bytes here while rdflib >= 6 returns str; calling
    # .decode() unconditionally breaks on the newer releases.
    if isinstance(serialized, bytes):
        return serialized.decode("utf-8")
    return serialized
def query_model(index, user_query):
    """Run *user_query* against the index and return the answer text.

    The query engine is created with text inclusion enabled, the
    ``tree_summarize`` response mode, ``hybrid`` embedding mode and a
    similarity cut-off of the top 5 matches.

    Args:
        index: Object exposing ``as_query_engine(**kwargs)``.
        user_query: The question to ask.

    Returns:
        The ``response`` attribute of the engine's query result.
    """
    query_engine = index.as_query_engine(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    response = query_engine.query(user_query)
    return response.response