Spaces:
No application file
No application file
File size: 5,804 Bytes
import os
import requests
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
import streamlit as st
from streamlit.logger import get_logger
from chains import load_embedding_model
from utils import create_constraints, create_vector_index
from PIL import Image
# Read connection settings; load_dotenv is pointed at the local ".env" file.
load_dotenv(".env")

url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
ollama_base_url = os.getenv("OLLAMA_BASE_URL")
embedding_model_name = os.getenv("EMBEDDING_MODEL")

# Fail fast with a readable message: assigning None into os.environ below
# would otherwise raise an opaque "str expected, not NoneType" TypeError.
if url is None:
    raise RuntimeError(
        "NEO4J_URI is not set; define it in the environment or in .env"
    )

# Remapping for Langchain Neo4j integration
os.environ["NEO4J_URL"] = url

logger = get_logger(__name__)

# Endpoint shared by both loader functions below.
so_api_base_url = "https://api.stackexchange.com/2.3/search/advanced"

embeddings, dimension = load_embedding_model(
    embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
)

# if Neo4j is local, you can go to http://localhost:7474/ to browse the database
neo4j_graph = Neo4jGraph(
    url=url, username=username, password=password, refresh_schema=False
)
# Schema preparation happens once per script run, before any import.
create_constraints(neo4j_graph)
create_vector_index(neo4j_graph)
def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
    """Fetch one page of answered Stack Overflow questions for a tag and import it.

    Args:
        tag: Tag to search for; sent as the ``tagged`` query parameter.
        page: Page number to request; each page asks for up to 100 questions.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            API answers with an HTTP error status.
    """
    # A params dict lets requests percent-encode every value. Hand-building
    # the query string broke tags with reserved characters: the "#" in "c#"
    # starts a URL fragment (dropping site/filter) and "+" decodes as a space.
    parameters = {
        "pagesize": 100,
        "page": page,
        "order": "desc",
        "sort": "creation",
        "answers": 1,
        "tagged": tag,
        "site": "stackoverflow",
        "filter": "!*236eb_eL9rai)MOSNZ-6D3Q6ZKb0buI*IVotWaTb",
    }
    # requests has no default timeout; without one a stalled connection
    # would leave the caller waiting indefinitely.
    response = requests.get(so_api_base_url, params=parameters, timeout=30)
    # Surface HTTP-level failures here instead of as a KeyError downstream.
    response.raise_for_status()
    insert_so_data(response.json())
def load_high_score_so_data() -> None:
    """Fetch Stack Overflow questions sorted by votes and import them.

    The request is sorted by votes (descending) and bounded below by a fixed
    ``fromdate``; unlike ``load_so_data`` it sends no tag or ``answers``
    restriction.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            API answers with an HTTP error status.
    """
    parameters = {
        # Unix epoch seconds: 2022-09-26 00:00:00 UTC.
        "fromdate": 1664150400,
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": (
            "!.DK56VBPooplF.)bWW5iOX32Fh1lcCkw1b_Y6Zkb7YD8.ZMhrR5.FRRsR6Z1uK8*Z5wPaONvyII"
        ),
    }
    # requests has no default timeout; without one a stalled connection
    # would leave the caller waiting indefinitely.
    response = requests.get(so_api_base_url, params=parameters, timeout=30)
    # Surface HTTP-level failures here instead of as a KeyError downstream.
    response.raise_for_status()
    insert_so_data(response.json())
def insert_so_data(data: dict) -> None:
    """Embed the questions and answers in an API payload and merge them into Neo4j.

    Every question dict in ``data["items"]`` and every answer dict under it is
    mutated in place: an ``"embedding"`` key is added before the items are
    handed to the import query.

    Args:
        data: Decoded JSON response holding the questions under ``"items"``.

    Raises:
        ValueError: If ``data`` has no ``"items"`` key (e.g. an API error
            payload was passed in).
    """
    if "items" not in data:
        # Stack Exchange error responses are documented to carry
        # error_name/error_message instead of items; report those rather
        # than failing with a bare KeyError('items').
        raise ValueError(
            "Stack Exchange API returned no items: "
            f"{data.get('error_name', 'unknown_error')} - "
            f"{data.get('error_message', 'no error message provided')}"
        )

    items = data["items"]
    if not items:
        # Nothing to embed or import; skip the database round trip.
        return

    # Calculate embedding values for questions and answers
    for q in items:
        question_text = q["title"] + "\n" + q["body_markdown"]
        q["embedding"] = embeddings.embed_query(question_text)
        # load_high_score_so_data does not restrict to answered questions, so
        # tolerate a missing "answers" field; setdefault also guarantees the
        # import query receives a list for q.answers.
        for a in q.setdefault("answers", []):
            # Answers are embedded together with their question for context.
            a["embedding"] = embeddings.embed_query(
                question_text + "\n" + a["body_markdown"]
            )

    # Cypher, the query language of Neo4j, is used to import the data
    # https://neo4j.com/docs/getting-started/cypher-intro/
    # https://neo4j.com/docs/cypher-cheat-sheet/5/auradb-enterprise/
    import_query = """
UNWIND $data AS q
MERGE (question:Question {id:q.question_id})
ON CREATE SET question.title = q.title, question.link = q.link, question.score = q.score,
question.favorite_count = q.favorite_count, question.creation_date = datetime({epochSeconds: q.creation_date}),
question.body = q.body_markdown, question.embedding = q.embedding
FOREACH (tagName IN q.tags |
MERGE (tag:Tag {name:tagName})
MERGE (question)-[:TAGGED]->(tag)
)
FOREACH (a IN q.answers |
MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id})
SET answer.is_accepted = a.is_accepted,
answer.score = a.score,
answer.creation_date = datetime({epochSeconds:a.creation_date}),
answer.body = a.body_markdown,
answer.embedding = a.embedding
MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")})
ON CREATE SET answerer.display_name = a.owner.display_name,
answerer.reputation= a.owner.reputation
MERGE (answer)<-[:PROVIDED]-(answerer)
)
WITH * WHERE NOT q.owner.user_id IS NULL
MERGE (owner:User {id:q.owner.user_id})
ON CREATE SET owner.display_name = q.owner.display_name,
owner.reputation = q.owner.reputation
MERGE (owner)-[:ASKED]->(question)
"""
    neo4j_graph.query(import_query, {"data": items})
# Streamlit
def get_tag() -> str:
    """Render the tag text box (pre-filled with "neo4j") and return its value."""
    return st.text_input("Which tag questions do you want to import?", value="neo4j")
def get_pages():
    """Render the page-count and start-page inputs in two columns.

    Returns:
        A ``(num_pages, start_page)`` tuple, both converted to ``int``.
    """
    pages_column, start_column = st.columns(2)

    with pages_column:
        page_count = st.number_input(
            "Number of pages (100 questions per page)", step=1, min_value=1
        )

    with start_column:
        first_page = st.number_input("Start page", step=1, min_value=1)

    # NOTE(review): indentation was lost in the source; the caption is assumed
    # to sit below both columns rather than inside the second one — confirm.
    st.caption("Only questions with answers will be imported.")

    return int(page_count), int(first_page)
def render_page():
    """Render the loader UI and run an import when one of its buttons is pressed.

    The primary button imports ``num_pages`` consecutive pages for the chosen
    tag, starting at ``start_page``. The button inside the expander imports
    the vote-sorted questions instead. Any exception raised during an import
    is shown to the user via ``st.error`` rather than propagated.
    """
    st.header("StackOverflow Loader")
    st.subheader("Choose StackOverflow tags to load into Neo4j")
    st.caption("Go to http://localhost:7474/ to explore the graph.")

    user_input = get_tag()
    num_pages, start_page = get_pages()

    if st.button("Import", type="primary"):
        with st.spinner("Loading... This might take a minute or two."):
            try:
                # Same pages as start_page + (k - 1) for k in 1..num_pages.
                for page in range(start_page, start_page + num_pages):
                    load_so_data(user_input, page)
                st.success("Import successful", icon="✅")
                st.caption("Data model")
                # Open the diagram only when it is actually displayed and
                # close the file afterwards; previously it was opened on
                # every rerun of the script and never closed.
                with Image.open("./images/datamodel.png") as datamodel_image:
                    st.image(datamodel_image)
                st.caption("Go to http://localhost:7474/ to interact with the database")
            except Exception as e:
                # Top-level UI boundary: report the failure instead of crashing.
                st.error(f"Error: {e}", icon="🚨")

    with st.expander("Highly ranked questions rather than tags?"):
        if st.button("Import highly ranked questions"):
            with st.spinner("Loading... This might take a minute or two."):
                try:
                    load_high_score_so_data()
                    st.success("Import successful", icon="✅")
                except Exception as e:
                    # Top-level UI boundary: report the failure instead of crashing.
                    st.error(f"Error: {e}", icon="🚨")


# The page is rendered as a side effect of running the script.
render_page()
|