# knowledge_graph/app.py — Streamlit knowledge-graph generator
# (Hugging Face Space; last upload note: "Update app.py", commit c641539, verified)
# app.py
import os
import subprocess
import sys

import matplotlib.pyplot as plt
import networkx as nx
import spacy
import streamlit as st
from neo4j import GraphDatabase
from sklearn.feature_extraction.text import TfidfVectorizer
# === Ensure spaCy model is installed ===
def install_spacy_model(model: str = "en_core_web_sm") -> None:
    """Download the spaCy *model* if it is not already installed.

    Uses ``sys.executable`` so the download runs in the same interpreter /
    virtualenv as this app — a bare ``"python"`` may resolve to a different
    installation on the host. ``check=True`` makes a failed download raise
    instead of being silently ignored (the original swallowed the failure
    and then crashed later on the second ``spacy.load``).
    """
    try:
        spacy.load(model)
    except OSError:
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model],
            check=True,
        )
        # Verify the freshly downloaded model actually loads.
        spacy.load(model)


install_spacy_model()

# Load spaCy model once at import time; reused by extract_triples() below.
nlp = spacy.load("en_core_web_sm")
# === Neo4j credentials ===
# NOTE(review): hard-coded credentials in source are a security risk. They are
# now read from the environment first (or Streamlit secrets mapped to env
# vars); the literals remain only as backward-compatible fallbacks for
# existing deployments and should be rotated and removed.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://ff701b1c.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g"
)
def get_neo4j_driver():
    """Create a Neo4j driver, or return None (showing a Streamlit error).

    ``GraphDatabase.driver()`` is lazy and does not contact the server, so
    the original try/except could never catch a bad URI or bad credentials
    here; ``verify_connectivity()`` forces a round-trip so connection
    failures surface immediately instead of on the first query.
    """
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        driver.verify_connectivity()
        return driver
    except Exception as e:
        st.error(f"Failed to connect to Neo4j: {e}")
        return None
# === TF-IDF Filtering (Optional for noise reduction) ===
def compute_tfidf_keywords(text: str, top_n=100):
    """Return the ``top_n`` highest-weighted TF-IDF terms of *text* as a set.

    English stop words are excluded; features are the vectorizer's
    (lowercased, single-token) vocabulary.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    matrix = vectorizer.fit_transform([text])
    terms = vectorizer.get_feature_names_out()
    weights = matrix.toarray()[0]
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    return {term for term, _ in ranked[:top_n]}
# === Triple Extraction ===
def extract_triples(text, use_tfidf=False):
    """Extract (subject, verb, object) triples from *text*.

    Per sentence: the ROOT token's lemma is the predicate, the first
    ``nsubj``/``nsubjpass`` noun chunk the subject, and the first
    ``dobj``/``pobj``/``attr`` noun chunk the object. A triple is emitted
    only when all three parts were found.

    When ``use_tfidf`` is True, a triple is kept only if its subject or
    object contains at least one top TF-IDF keyword. TF-IDF features are
    lowercased single tokens, so the match is done word-by-word — the
    original whole-chunk comparison (``subject.lower() in keywords``)
    silently rejected almost every multi-word noun chunk.
    """
    doc = nlp(text)
    tfidf_keywords = compute_tfidf_keywords(text) if use_tfidf else None

    def _contains_keyword(phrase: str) -> bool:
        # Per-word membership: keywords are unigrams from TfidfVectorizer.
        return any(word in tfidf_keywords for word in phrase.lower().split())

    triples = []
    for sent in doc.sents:
        subject = ""
        obj = ""
        verb = ""
        root = [token for token in sent if token.dep_ == "ROOT"]
        if root:
            verb = root[0].lemma_
        for chunk in sent.noun_chunks:
            if chunk.root.dep_ in ("nsubj", "nsubjpass") and not subject:
                subject = chunk.text
            elif chunk.root.dep_ in ("dobj", "pobj", "attr") and not obj:
                obj = chunk.text
        if subject and verb and obj:
            # Empty/None keyword set disables filtering (matches the original
            # `if tfidf_keywords:` truthiness behavior).
            if not tfidf_keywords or _contains_keyword(subject) or _contains_keyword(obj):
                triples.append((subject.strip(), verb.strip(), obj.strip()))
    return triples
# === Visualization Function ===
def show_graph(triples):
    """Render (subject, predicate, object) triples as a directed graph.

    Nodes are subjects/objects; each edge carries its predicate as a label.
    Shows a Streamlit warning and returns early when there is nothing to draw.
    """
    if not triples:
        st.warning("No triples found to visualize.")
        return
    G = nx.DiGraph()
    for s, p, o in triples:
        # add_edge creates missing endpoint nodes implicitly.
        G.add_edge(s, o, label=p)
    pos = nx.spring_layout(G, seed=42)  # fixed seed -> reproducible layout
    fig = plt.figure(figsize=(10, 8))
    nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=2000, font_size=10, edge_color='gray')
    nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)})
    st.pyplot(fig)
    # Close (not just clf) the figure: plt.clf() leaves it registered with
    # pyplot, so repeated Streamlit reruns would accumulate open figures.
    plt.close(fig)
# === Streamlit UI ===
st.title("🧠 Knowledge Graph Generator")

text_input = st.text_area("Paste your text here:", height=200)
use_tfidf = st.checkbox("Use TF-IDF filtering (Optional: Recommended for large texts)")

if st.button("Generate Graph"):
    # Guard clauses: complain about missing input / empty extraction first,
    # render the triples and graph only on the happy path.
    if not text_input:
        st.warning("Please enter some text.")
    else:
        all_triples = extract_triples(text_input, use_tfidf=use_tfidf)
        if not all_triples:
            st.warning("No valid triples could be extracted. Try different text.")
        else:
            st.subheader("πŸ”— Extracted Triples:")
            for subj, pred, obj in all_triples:
                st.markdown(f"- **({subj} β†’ {pred} β†’ {obj})**")
            show_graph(all_triples)