Spaces:
Build error
Build error
| import kuzu | |
| import logging | |
| import sys | |
| import os | |
| import rdflib | |
| from rdflib import Graph, Literal, RDF, URIRef | |
| from rdflib.namespace import FOAF, XSD, Namespace | |
| #import llama_index | |
| from llama_index.graph_stores import KuzuGraphStore | |
| from llama_index import ( | |
| SimpleDirectoryReader, | |
| ServiceContext, | |
| KnowledgeGraphIndex, | |
| ) | |
| from llama_index.readers import SimpleWebPageReader | |
| from llama_index.indices.loading import load_index_from_storage | |
| from llama_index.llms import OpenAI | |
| from IPython.display import Markdown, display | |
| from llama_index.storage.storage_context import StorageContext | |
| from pyvis.network import Network | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import umap | |
def make_dir():
    """Ensure a ``data`` directory exists in the current working directory.

    The directory is created in a single call with ``exist_ok=True`` rather
    than an exists-then-create check, so two callers running at the same time
    cannot race between the check and the creation.

    Raises:
        FileExistsError: If ``data`` exists but is not a directory.
    """
    os.makedirs("data", exist_ok=True)
def save_uploadedfile(uploadedfile):
    """Write an uploaded file's bytes into the ``data`` directory.

    Args:
        uploadedfile: Object exposing ``name`` (the file name to save under)
            and ``getbuffer()`` (the bytes-like content to write).
            NOTE(review): presumably a Streamlit ``UploadedFile`` - confirm
            against the caller.

    Raises:
        ValueError: If ``uploadedfile.name`` has no usable file-name part.
        FileNotFoundError: If the ``data`` directory does not exist.
    """
    # The name is supplied by whoever uploaded the file; keep only its final
    # path component so a value such as "../x" cannot escape ``data``.
    safe_name = os.path.basename(uploadedfile.name)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid upload file name: {uploadedfile.name!r}")
    with open(os.path.join("data", safe_name), "wb") as f:
        f.write(uploadedfile.getbuffer())
def load_index(token, name, base_url):
    """Reload a previously persisted knowledge-graph index.

    Exports the OpenAI credentials to the process environment, configures
    INFO-level logging to stdout, opens the Kuzu database at ``<name>/kg``
    and rebuilds the index from the storage persisted at ``<name>/storage``.

    Args:
        token: OpenAI API key.
        name: Directory that holds the ``kg`` database and ``storage`` folder.
        base_url: Base URL of the OpenAI-compatible endpoint.

    Returns:
        The index produced by ``load_index_from_storage``.
    """
    os.environ.update({"OPENAI_API_KEY": token, "OPENAI_API_BASE": base_url})
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    kuzu_store = KuzuGraphStore(kuzu.Database(name + "/kg"))
    chat_model = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    service_ctx = ServiceContext.from_defaults(llm=chat_model, chunk_size=512)
    storage_ctx = StorageContext.from_defaults(
        graph_store=kuzu_store,
        persist_dir=name + "/storage",
    )
    return load_index_from_storage(
        storage_context=storage_ctx,
        service_context=service_ctx,
    )
def get_index_pdf(token, name, base_url):
    """Build and persist a knowledge-graph index from the files in ``./data``.

    Loads every document found by ``SimpleDirectoryReader("./data")``,
    creates the directory ``name``, builds a ``KnowledgeGraphIndex`` backed
    by a Kuzu database at ``<name>/kg`` and persists the index storage to
    ``<name>/storage``.

    Args:
        token: OpenAI API key; also exported as ``OPENAI_API_KEY``.
        name: Directory to create for the database and persisted storage.
        base_url: Base URL of the OpenAI-compatible endpoint; also exported
            as ``OPENAI_API_BASE``.

    Returns:
        The newly built ``KnowledgeGraphIndex``.

    Raises:
        FileExistsError: If the directory ``name`` already exists.
    """
    documents = SimpleDirectoryReader("./data").load_data()
    # Debug aid: show what was loaded before the index is built.
    print(documents)
    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    os.environ["OPENAI_API_BASE"] = base_url
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    # Triplet extraction calls the LLM for every chunk, so this step can be
    # slow for large inputs.
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index
def get_index(links, token, name, base_url):
    """Build and persist a knowledge-graph index from web pages.

    Creates the directory ``name``, downloads ``links`` as text with
    ``SimpleWebPageReader``, builds a ``KnowledgeGraphIndex`` backed by a
    Kuzu database at ``<name>/kg`` and persists the index storage to
    ``<name>/storage``.

    Args:
        links: URLs handed to ``SimpleWebPageReader.load_data``.
        token: OpenAI API key; also exported as ``OPENAI_API_KEY``.
        name: Directory to create for the database and persisted storage.
        base_url: Base URL of the OpenAI-compatible endpoint; also exported
            as ``OPENAI_API_BASE``.

    Returns:
        The newly built ``KnowledgeGraphIndex``.

    Raises:
        FileExistsError: If the directory ``name`` already exists.
    """
    os.mkdir(name)
    os.environ.update({"OPENAI_API_KEY": token, "OPENAI_API_BASE": base_url})
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    kuzu_store = KuzuGraphStore(kuzu.Database(name + "/kg"))
    page_reader = SimpleWebPageReader(html_to_text=True)
    web_documents = page_reader.load_data(links)

    chat_model = OpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        api_key=token,
        openai_api_base=base_url,
    )
    service_ctx = ServiceContext.from_defaults(llm=chat_model, chunk_size=512)
    storage_ctx = StorageContext.from_defaults(graph_store=kuzu_store)

    # NOTE: can take a while!
    kg_index = KnowledgeGraphIndex.from_documents(
        documents=web_documents,
        max_triplets_per_chunk=2,
        storage_context=storage_ctx,
        service_context=service_ctx,
        show_progress=True,
        include_embeddings=True,
    )
    kg_index.storage_context.persist(name + "/storage")
    return kg_index
def get_network_graph(index):
    """Export the index's graph as an interactive PyVis HTML page.

    Converts the NetworkX graph returned by ``index.get_networkx_graph()``
    into a directed PyVis network and writes it to
    ``kuzugraph_draw3.html`` in the current working directory.

    Args:
        index: Object exposing ``get_networkx_graph()``.
    """
    nx_graph = index.get_networkx_graph()
    pyvis_net = Network(directed=True)
    pyvis_net.from_nx(nx_graph)
    pyvis_net.save_graph("kuzugraph_draw3.html")
def get_embeddings(index):
    """Return the non-missing entries of the index's ``embedding_dict``.

    The index struct is converted to a dict, loaded into a DataFrame, and
    the ``embedding_dict`` column is returned with missing rows dropped.

    Args:
        index: Object whose ``index_struct.to_dict()`` contains an
            ``embedding_dict`` entry.

    Returns:
        A pandas Series taken from the ``embedding_dict`` column.
    """
    struct_as_dict = index.index_struct.to_dict()
    struct_frame = pd.DataFrame.from_dict(struct_as_dict)
    return struct_frame["embedding_dict"].dropna()
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Project embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are equal-length
            embedding vectors; its index is shown as the hover ``Label``.
        n_neighbors: Passed through to ``umap.UMAP``.
        min_dist: Passed through to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions (at least 2). Only
            the first two dimensions are plotted.

    Returns:
        The figure created by ``plotly.express.scatter``.

    Raises:
        ValueError: If ``embedding_series`` is empty or ``n_components`` is
            smaller than 2.
    """
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a 2-D scatter plot")
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")

    # Work on a plain list so the first vector is fetched by position. The
    # series is indexed by labels, and integer lookups on a label-indexed
    # Series are deprecated in recent pandas releases.
    vectors = embedding_series.tolist()
    embedding_df = pd.DataFrame(
        vectors,
        columns=[f"dim_{i + 1}" for i in range(len(vectors[0]))],
    )

    # Perform UMAP dimensionality reduction; the fixed seed keeps the layout
    # reproducible between runs.
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)

    # One column per requested component, so n_components > 2 no longer
    # fails on a column-count mismatch.
    umap_columns = [f"UMAP Dimension {i + 1}" for i in range(n_components)]
    umap_df = pd.DataFrame(umap_embedded, columns=umap_columns)
    umap_df["Label"] = embedding_series.index

    # Plot the first two UMAP dimensions using Plotly Express.
    fig = px.scatter(
        umap_df,
        x="UMAP Dimension 1",
        y="UMAP Dimension 2",
        hover_data=["Label"],
        title="UMAP Visualization of Embeddings",
    )
    return fig
def generate_rdf(index):
    """Build an rdflib ``Graph`` describing the nodes held by ``index``.

    Every node becomes a subject under ``http://example.com/`` typed as
    ``Node`` with its text attached as a literal, plus one triple per entry
    in the node's ``relationships``.

    NOTE(review): this reads ``index.index_struct.node_dict`` and expects
    each relationship to expose ``predicate`` and ``object_id`` - confirm
    that the index type in use actually provides these attributes.

    Args:
        index: Index whose ``index_struct.node_dict`` maps to node objects
            with ``node_id``, ``text`` and ``relationships``.

    Returns:
        The populated ``rdflib.Graph``.
    """
    rdf_graph = Graph()
    # Namespace under which all subjects, predicates and objects are minted.
    ns = Namespace("http://example.com/")

    for entry in index.index_struct.node_dict.values():
        node_uri = ns[str(entry.node_id)]

        # Node-level properties.
        rdf_graph.add((node_uri, RDF.type, ns["Node"]))
        rdf_graph.add((node_uri, ns["text"], Literal(entry.text)))

        # Outgoing relationships.
        for rel in entry.relationships:
            rel_predicate = ns[rel.predicate]
            target_uri = ns[str(rel.object_id)]
            rdf_graph.add((node_uri, rel_predicate, target_uri))

    return rdf_graph
def visualize_rdf(rdf_graph):
    """Return ``rdf_graph`` serialized as Turtle text.

    Despite the name, nothing is drawn: the graph is only serialized so the
    caller can display the text.

    Args:
        rdf_graph: Object exposing ``serialize(format=...)``, e.g. an rdflib
            ``Graph``.

    Returns:
        The Turtle serialization as ``str``.
    """
    serialized = rdf_graph.serialize(format="turtle")
    # Older rdflib releases return bytes here while newer ones return str;
    # decode only when needed so both work.
    if isinstance(serialized, bytes):
        serialized = serialized.decode("utf-8")
    return serialized
def query_model(index, user_query):
    """Answer ``user_query`` against ``index`` and return the answer text.

    A query engine is created with text inclusion, ``tree_summarize``
    response synthesis, ``hybrid`` embedding mode and a similarity top-k
    of 5.

    Args:
        index: Object exposing ``as_query_engine(**options)``.
        user_query: The question to ask.

    Returns:
        The ``response`` attribute of the query engine's result.
    """
    engine_options = {
        "include_text": True,
        "response_mode": "tree_summarize",
        "embedding_mode": "hybrid",
        "similarity_top_k": 5,
    }
    engine = index.as_query_engine(**engine_options)
    answer = engine.query(user_query)
    return answer.response