Spaces:

jaibadachiya
/

knowledge_graph

Sleeping

App Files Community

jaibadachiya committed on Apr 24, 2025

Commit

a5dfac5

verified ·

1 Parent(s): 6cf56fc

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -93

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ import networkx as nx
 import logging
 from sklearn.feature_extraction.text import TfidfVectorizer
 import re
-import os
 # Set up logging configuration
 logging.basicConfig(level=logging.INFO)
@@ -20,98 +19,116 @@ def install_spacy_model():
         subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
 install_spacy_model()
-# Load the spaCy model after ensuring it's installed
 nlp = spacy.load("en_core_web_sm")
 # Neo4j credentials
-uri = os.environ.get("neo4j+s://ff701b1c.databases.neo4j.io")
-username = os.environ.get("neo4j")
-password = os.environ.get("BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g")
-# Connect to Neo4j to check the connection and then close it
-def connect_to_neo4j():
-    try:
-        driver = GraphDatabase.driver(uri, auth=(username, password))
-        logging.info("✅ Connected to Neo4j!")
-        driver.close()
-        logging.info("🔒 Neo4j driver closed.")
-    except Exception as e:
-        logging.error(f"❌ An error occurred while connecting to Neo4j: {e}", exc_info=True)
-        raise
-connect_to_neo4j()
-# Text Processing
-def load_and_clean_text(file_path: str) -> str:
-    with open(file_path, 'r', encoding='utf-8') as file:
-        text = file.read()
-    text = re.sub(r'\n+', ' ', text)
-    return re.sub(r'\s+', ' ', text).strip().lower()
-# TF-IDF Filtering
-def compute_tfidf_keywords(text: str, top_n=60):
-    vectorizer = TfidfVectorizer(stop_words='english')
-    X = vectorizer.fit_transform([text])
-    scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
-    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
-    return {word for word, _ in sorted_scores[:top_n]}
-# Triple Extraction
-def get_full_phrase(token) -> str:
-    return ' '.join(tok.text for tok in token.subtree if tok.dep_ != 'punct').strip()
-def extract_rich_triples(doc, tfidf_keywords) -> list:
-    triples = []
-    for sent in doc.sents:
-        subjects = [tok for tok in sent if "subj" in tok.dep_]
-        objects = [tok for tok in sent if "obj" in tok.dep_]
-        verbs = [tok for tok in sent if tok.pos_ == "VERB"]
-        for subj in subjects:
-            for obj in objects:
-                for verb in verbs:
-                    s = get_full_phrase(subj)
-                    o = get_full_phrase(obj)
-                    if s.lower() in tfidf_keywords or o.lower() in tfidf_keywords:
-                        triples.append((s, verb.lemma_, o))
-    return triples
-# Graph Visualization
-def visualize_knowledge_graph(triples: list, title: str = "Knowledge Graph"):
-    G = nx.DiGraph()
-    for subj, pred, obj in triples:
-        G.add_node(subj, label='Subject')
-        G.add_node(obj, label='Object')
-        G.add_edge(subj, obj, label=pred)
-    pos = nx.spring_layout(G, k=1.2, seed=42)
-    node_colors = ['skyblue' if G.nodes[n]['label'] == 'Subject' else 'lightgreen' for n in G.nodes]
-    plt.figure(figsize=(16, 16))
-    nx.draw(G, pos, with_labels=True, node_size=1200, node_color=node_colors,
-            font_size=10, font_weight='bold', edge_color='gray', alpha=0.8)
-    nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)},
-                                 font_size=8, font_color='red')
-    plt.title(title, fontsize=20)
-    plt.show()
-# Streamlit UI
-st.set_page_config(page_title="Knowledge Graph Generator", layout="wide")
-st.title("🧠 Knowledge Graph Generator")
-text_input = st.text_area("Paste your text here", height=200)
-if st.button("Generate Graph"):
-    if text_input:
-        try:
-            triples = extract_rich_triples(nlp(text_input), compute_tfidf_keywords(text_input))
-            logging.info(f"🧠 Extracted {len(triples)} filtered triples.")
-            for t in triples[:10]:
-                st.write("🔗", t)
-            # Final Visualization
-            visualize_knowledge_graph(triples, title="Filtered Knowledge Graph (TF-IDF)")
-        except Exception as e:
-            logging.error(f"❌ An error occurred: {e}", exc_info=True)
-            st.error("An error occurred. Please check the logs.")
-    else:
-        st.warning("Please enter some text.")

 import logging
 from sklearn.feature_extraction.text import TfidfVectorizer
 import re
 # Set up logging configuration
 logging.basicConfig(level=logging.INFO)
         subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
 install_spacy_model()
+# Load the model after ensuring it's installed
 nlp = spacy.load("en_core_web_sm")
 # Neo4j credentials
+uri = "neo4j+s://ff701b1c.databases.neo4j.io"
+username = "neo4j"
+password = "BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g"
+# Connect to Neo4j
+driver = None
+try:
+    driver = GraphDatabase.driver(uri, auth=(username, password))
+    logging.info("✅ Connected to Neo4j!")
+    def close_driver():
+        if driver:
+            driver.close()
+            logging.info("🔒 Neo4j driver closed.")
+    def create_entity(tx, name: str):
+        tx.run("MERGE (e:Entity {name: $name})", name=name)
+    def create_relationship(tx, subj: str, pred: str, obj: str):
+        tx.run("""
+            MERGE (a:Entity {name: $subj})
+            MERGE (b:Entity {name: $obj})
+            MERGE (a)-[:RELATION {name: $pred}]->(b)
+        """, subj=subj, pred=pred, obj=obj)
+    # Text Processing
+    def load_and_clean_text(file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as file:
+            text = file.read()
+        text = re.sub(r'\n+', ' ', text)
+        return re.sub(r'\s+', ' ', text).strip().lower()
+    # TF-IDF Filtering
+    def compute_tfidf_keywords(text: str, top_n=60):
+        vectorizer = TfidfVectorizer(stop_words='english')
+        X = vectorizer.fit_transform([text])
+        scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
+        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
+        return {word for word, _ in sorted_scores[:top_n]}
+    # Triple Extraction
+    def get_full_phrase(token) -> str:
+        return ' '.join(tok.text for tok in token.subtree if tok.dep_ != 'punct').strip()
+    def extract_rich_triples(doc, tfidf_keywords) -> list:
+        triples = []
+        for sent in doc.sents:
+            subjects = [tok for tok in sent if "subj" in tok.dep_]
+            objects = [tok for tok in sent if "obj" in tok.dep_]
+            verbs = [tok for tok in sent if tok.pos_ == "VERB"]
+            for subj in subjects:
+                for obj in objects:
+                    for verb in verbs:
+                        s = get_full_phrase(subj)
+                        o = get_full_phrase(obj)
+                        if s.lower() in tfidf_keywords or o.lower() in tfidf_keywords:
+                            triples.append((s, verb.lemma_, o))
+        return triples
+    # Graph Visualization
+    def visualize_knowledge_graph(triples: list, title: str = "Knowledge Graph"):
+        G = nx.DiGraph()
+        for subj, pred, obj in triples:
+            G.add_node(subj, label='Subject')
+            G.add_node(obj, label='Object')
+            G.add_edge(subj, obj, label=pred)
+        pos = nx.spring_layout(G, k=1.2, seed=42)
+        node_colors = ['skyblue' if G.nodes[n]['label'] == 'Subject' else 'lightgreen' for n in G.nodes]
+        plt.figure(figsize=(16, 16))
+        nx.draw(G, pos, with_labels=True, node_size=1200, node_color=node_colors,
+                font_size=10, font_weight='bold', edge_color='gray', alpha=0.8)
+        nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)},
+                                     font_size=8, font_color='red')
+        plt.title(title, fontsize=20)
+        plt.show()
+    # === Main Execution ===
+    file_path = r'C:\Users\jaiba\Desktop\KNOWLEDGE GRAPH\data2.txt'
+    text = load_and_clean_text(file_path)
+    tfidf_keywords = compute_tfidf_keywords(text)
+    doc = nlp(text)
+    triples = extract_rich_triples(doc, tfidf_keywords)
+    logging.info(f"🧠 Extracted {len(triples)} filtered triples.")
+    for t in triples[:10]:  # Print only the top 10 triplets
+        print("🔗", t)
+    # === Push to Neo4j ===
+    with driver.session() as session:
+        for subj, pred, obj in triples:
+            session.execute_write(create_entity, subj)
+            session.execute_write(create_entity, obj)
+            session.execute_write(create_relationship, subj, pred, obj)
+    logging.info("📡 Triples successfully stored in Neo4j.")
+    print("📡 Triples successfully stored in Neo4j.")
+    # === Final Visualization ===
+    visualize_knowledge_graph(triples, title="Filtered Knowledge Graph (TF-IDF)")
+except Exception as e:
+    logging.error(f"❌ An error occurred: {e}", exc_info=True)
+finally:
+    if driver:
+        close_driver()