jaibadachiya committed on
Commit
a5dfac5
·
verified ·
1 Parent(s): 6cf56fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -93
app.py CHANGED
@@ -7,7 +7,6 @@ import networkx as nx
7
  import logging
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  import re
10
- import os
11
 
12
  # Set up logging configuration
13
  logging.basicConfig(level=logging.INFO)
@@ -20,98 +19,116 @@ def install_spacy_model():
20
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
21
  install_spacy_model()
22
 
23
- # Load the spaCy model after ensuring it's installed
24
  nlp = spacy.load("en_core_web_sm")
25
 
26
  # Neo4j credentials
27
- uri = os.environ.get("neo4j+s://ff701b1c.databases.neo4j.io")
28
- username = os.environ.get("neo4j")
29
- password = os.environ.get("BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g")
30
-
31
- # Connect to Neo4j to check the connection and then close it
32
- def connect_to_neo4j():
33
- try:
34
- driver = GraphDatabase.driver(uri, auth=(username, password))
35
- logging.info("✅ Connected to Neo4j!")
36
- driver.close()
37
- logging.info("🔒 Neo4j driver closed.")
38
- except Exception as e:
39
- logging.error(f"❌ An error occurred while connecting to Neo4j: {e}", exc_info=True)
40
- raise
41
-
42
- connect_to_neo4j()
43
-
44
- # Text Processing
45
- def load_and_clean_text(file_path: str) -> str:
46
- with open(file_path, 'r', encoding='utf-8') as file:
47
- text = file.read()
48
- text = re.sub(r'\n+', ' ', text)
49
- return re.sub(r'\s+', ' ', text).strip().lower()
50
-
51
- # TF-IDF Filtering
52
- def compute_tfidf_keywords(text: str, top_n=60):
53
- vectorizer = TfidfVectorizer(stop_words='english')
54
- X = vectorizer.fit_transform([text])
55
- scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
56
- sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
57
- return {word for word, _ in sorted_scores[:top_n]}
58
-
59
- # Triple Extraction
60
- def get_full_phrase(token) -> str:
61
- return ' '.join(tok.text for tok in token.subtree if tok.dep_ != 'punct').strip()
62
-
63
- def extract_rich_triples(doc, tfidf_keywords) -> list:
64
- triples = []
65
- for sent in doc.sents:
66
- subjects = [tok for tok in sent if "subj" in tok.dep_]
67
- objects = [tok for tok in sent if "obj" in tok.dep_]
68
- verbs = [tok for tok in sent if tok.pos_ == "VERB"]
69
- for subj in subjects:
70
- for obj in objects:
71
- for verb in verbs:
72
- s = get_full_phrase(subj)
73
- o = get_full_phrase(obj)
74
- if s.lower() in tfidf_keywords or o.lower() in tfidf_keywords:
75
- triples.append((s, verb.lemma_, o))
76
- return triples
77
-
78
- # Graph Visualization
79
- def visualize_knowledge_graph(triples: list, title: str = "Knowledge Graph"):
80
- G = nx.DiGraph()
81
- for subj, pred, obj in triples:
82
- G.add_node(subj, label='Subject')
83
- G.add_node(obj, label='Object')
84
- G.add_edge(subj, obj, label=pred)
85
-
86
- pos = nx.spring_layout(G, k=1.2, seed=42)
87
- node_colors = ['skyblue' if G.nodes[n]['label'] == 'Subject' else 'lightgreen' for n in G.nodes]
88
-
89
- plt.figure(figsize=(16, 16))
90
- nx.draw(G, pos, with_labels=True, node_size=1200, node_color=node_colors,
91
- font_size=10, font_weight='bold', edge_color='gray', alpha=0.8)
92
- nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)},
93
- font_size=8, font_color='red')
94
- plt.title(title, fontsize=20)
95
- plt.show()
96
-
97
- # Streamlit UI
98
- st.set_page_config(page_title="Knowledge Graph Generator", layout="wide")
99
- st.title("🧠 Knowledge Graph Generator")
100
-
101
- text_input = st.text_area("Paste your text here", height=200)
102
-
103
- if st.button("Generate Graph"):
104
- if text_input:
105
- try:
106
- triples = extract_rich_triples(nlp(text_input), compute_tfidf_keywords(text_input))
107
- logging.info(f"🧠 Extracted {len(triples)} filtered triples.")
108
- for t in triples[:10]:
109
- st.write("🔗", t)
110
-
111
- # Final Visualization
112
- visualize_knowledge_graph(triples, title="Filtered Knowledge Graph (TF-IDF)")
113
- except Exception as e:
114
- logging.error(f"❌ An error occurred: {e}", exc_info=True)
115
- st.error("An error occurred. Please check the logs.")
116
- else:
117
- st.warning("Please enter some text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import logging
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  import re
 
10
 
11
  # Set up logging configuration
12
  logging.basicConfig(level=logging.INFO)
 
19
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
20
  install_spacy_model()
21
 
22
+ # Load the model after ensuring it's installed
23
  nlp = spacy.load("en_core_web_sm")
24
 
25
  # Neo4j credentials
26
+ uri = "neo4j+s://ff701b1c.databases.neo4j.io"
27
+ username = "neo4j"
28
+ password = "BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g"
29
+
30
+ # Connect to Neo4j
31
+ driver = None
32
+ try:
33
+ driver = GraphDatabase.driver(uri, auth=(username, password))
34
+ logging.info("✅ Connected to Neo4j!")
35
+
36
+ def close_driver():
37
+ if driver:
38
+ driver.close()
39
+ logging.info("🔒 Neo4j driver closed.")
40
+
41
+ def create_entity(tx, name: str):
42
+ tx.run("MERGE (e:Entity {name: $name})", name=name)
43
+
44
+ def create_relationship(tx, subj: str, pred: str, obj: str):
45
+ tx.run("""
46
+ MERGE (a:Entity {name: $subj})
47
+ MERGE (b:Entity {name: $obj})
48
+ MERGE (a)-[:RELATION {name: $pred}]->(b)
49
+ """, subj=subj, pred=pred, obj=obj)
50
+
51
+ # Text Processing
52
+ def load_and_clean_text(file_path: str) -> str:
53
+ with open(file_path, 'r', encoding='utf-8') as file:
54
+ text = file.read()
55
+ text = re.sub(r'\n+', ' ', text)
56
+ return re.sub(r'\s+', ' ', text).strip().lower()
57
+
58
+ # TF-IDF Filtering
59
+ def compute_tfidf_keywords(text: str, top_n=60):
60
+ vectorizer = TfidfVectorizer(stop_words='english')
61
+ X = vectorizer.fit_transform([text])
62
+ scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
63
+ sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
64
+ return {word for word, _ in sorted_scores[:top_n]}
65
+
66
+ # Triple Extraction
67
+ def get_full_phrase(token) -> str:
68
+ return ' '.join(tok.text for tok in token.subtree if tok.dep_ != 'punct').strip()
69
+
70
+ def extract_rich_triples(doc, tfidf_keywords) -> list:
71
+ triples = []
72
+ for sent in doc.sents:
73
+ subjects = [tok for tok in sent if "subj" in tok.dep_]
74
+ objects = [tok for tok in sent if "obj" in tok.dep_]
75
+ verbs = [tok for tok in sent if tok.pos_ == "VERB"]
76
+ for subj in subjects:
77
+ for obj in objects:
78
+ for verb in verbs:
79
+ s = get_full_phrase(subj)
80
+ o = get_full_phrase(obj)
81
+ if s.lower() in tfidf_keywords or o.lower() in tfidf_keywords:
82
+ triples.append((s, verb.lemma_, o))
83
+ return triples
84
+
85
+ # Graph Visualization
86
+ def visualize_knowledge_graph(triples: list, title: str = "Knowledge Graph"):
87
+ G = nx.DiGraph()
88
+ for subj, pred, obj in triples:
89
+ G.add_node(subj, label='Subject')
90
+ G.add_node(obj, label='Object')
91
+ G.add_edge(subj, obj, label=pred)
92
+
93
+ pos = nx.spring_layout(G, k=1.2, seed=42)
94
+ node_colors = ['skyblue' if G.nodes[n]['label'] == 'Subject' else 'lightgreen' for n in G.nodes]
95
+
96
+ plt.figure(figsize=(16, 16))
97
+ nx.draw(G, pos, with_labels=True, node_size=1200, node_color=node_colors,
98
+ font_size=10, font_weight='bold', edge_color='gray', alpha=0.8)
99
+ nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)},
100
+ font_size=8, font_color='red')
101
+ plt.title(title, fontsize=20)
102
+ plt.show()
103
+
104
+ # === Main Execution ===
105
+ file_path = r'C:\Users\jaiba\Desktop\KNOWLEDGE GRAPH\data2.txt'
106
+ text = load_and_clean_text(file_path)
107
+ tfidf_keywords = compute_tfidf_keywords(text)
108
+
109
+ doc = nlp(text)
110
+
111
+ triples = extract_rich_triples(doc, tfidf_keywords)
112
+ logging.info(f"🧠 Extracted {len(triples)} filtered triples.")
113
+ for t in triples[:10]: # Print only the top 10 triplets
114
+ print("🔗", t)
115
+
116
+ # === Push to Neo4j ===
117
+ with driver.session() as session:
118
+ for subj, pred, obj in triples:
119
+ session.execute_write(create_entity, subj)
120
+ session.execute_write(create_entity, obj)
121
+ session.execute_write(create_relationship, subj, pred, obj)
122
+
123
+ logging.info("📡 Triples successfully stored in Neo4j.")
124
+ print("📡 Triples successfully stored in Neo4j.")
125
+
126
+ # === Final Visualization ===
127
+ visualize_knowledge_graph(triples, title="Filtered Knowledge Graph (TF-IDF)")
128
+
129
+ except Exception as e:
130
+ logging.error(f"❌ An error occurred: {e}", exc_info=True)
131
+
132
+ finally:
133
+ if driver:
134
+ close_driver()