jaibadachiya committed on
Commit
0de17ce
·
verified ·
1 Parent(s): 4fcbe36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -19
app.py CHANGED
@@ -15,7 +15,7 @@ def install_spacy_model():
15
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
16
  install_spacy_model()
17
 
18
- # Load the spaCy model
19
  nlp = spacy.load("en_core_web_sm")
20
 
21
  # === Neo4j credentials ===
@@ -27,29 +27,38 @@ password = "BfZM7YRKpFz1b_V7acAmOtaSQHPU9xK03rJlfPep88g"
27
  driver = GraphDatabase.driver(uri, auth=(username, password))
28
 
29
  # === TF-IDF Filtering ===
30
- def compute_tfidf_keywords(text: str, top_n=60):
31
  vectorizer = TfidfVectorizer(stop_words='english')
32
  X = vectorizer.fit_transform([text])
33
  scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
34
  sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
35
  return {word for word, _ in sorted_scores[:top_n]}
36
 
37
- # === Triple extraction with TF-IDF filtering and limit to 10 ===
38
  def extract_triples(text):
39
  doc = nlp(text)
40
  tfidf_keywords = compute_tfidf_keywords(text)
41
  triples = []
 
42
  for sent in doc.sents:
43
- subjects = [tok for tok in sent if "subj" in tok.dep_]
44
- verbs = [tok for tok in sent if tok.pos_ == "VERB"]
45
- objects = [tok for tok in sent if "obj" in tok.dep_]
46
- for subj in subjects:
47
- for verb in verbs:
48
- for obj in objects:
49
- if (subj.text.lower() in tfidf_keywords or
50
- verb.lemma_.lower() in tfidf_keywords or
51
- obj.text.lower() in tfidf_keywords):
52
- triples.append((subj.text, verb.lemma_, obj.text))
 
 
 
 
 
 
 
 
53
  return triples
54
 
55
  # === Visualization Function ===
@@ -62,20 +71,24 @@ def show_graph(triples):
62
  pos = nx.spring_layout(G)
63
  plt.figure(figsize=(10, 8))
64
  nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=2000, font_size=10)
65
- nx.draw_networkx_edge_labels(G, pos, edge_labels={(u,v):d['label'] for u,v,d in G.edges(data=True)})
66
  st.pyplot(plt)
67
 
68
  # === Streamlit UI ===
69
- st.title("🧠 Knowledge Graph Generator with TF-IDF Filtering")
70
 
71
  text_input = st.text_area("Paste your text here", height=200)
72
 
73
  if st.button("Generate Graph"):
74
  if text_input:
75
- triples = extract_triples(text_input)
76
- st.write("### Extracted Triples (Top 10 filtered by TF-IDF):")
77
- for t in triples:
 
 
78
  st.write("🔗", t)
79
- show_graph(triples)
 
 
80
  else:
81
  st.warning("Please enter some text.")
 
15
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
16
  install_spacy_model()
17
 
18
+ # Load spaCy model
19
  nlp = spacy.load("en_core_web_sm")
20
 
21
  # === Neo4j credentials ===
 
27
  driver = GraphDatabase.driver(uri, auth=(username, password))
28
 
29
  # === TF-IDF Filtering ===
30
def compute_tfidf_keywords(text: str, top_n=100):
    """Return the highest-scoring TF-IDF terms of ``text`` as a set.

    The text is vectorized as a single document with English stop words
    removed, terms are ranked by descending score, and the first ``top_n``
    are kept.

    Args:
        text: Raw input text.
        top_n: Maximum number of terms to keep.

    Returns:
        A set of lower-cased terms. The set is empty when ``text`` is empty,
        whitespace-only, or contains nothing but stop words / single
        characters.
    """
    # Nothing to vectorize; avoid handing an empty document to scikit-learn.
    if not text or not text.strip():
        return set()

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization and stop-word removal. For the caller that
        # simply means "no keywords", not a fatal error.
        return set()

    scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return {word for word, _ in ranked[:top_n]}
36
 
37
+ # === Enhanced Triple Extraction ===
38
def extract_triples(text):
    """Extract (subject, verb, object) triples from ``text``, one per sentence.

    For every sentence the ROOT token's lemma is used as the verb, the last
    noun chunk whose head is ``nsubj``/``nsubjpass`` as the subject, and the
    last noun chunk whose head is ``dobj``/``pobj``/``attr`` as the object.
    A triple is kept only when at least one of its three parts contains a
    TF-IDF keyword of the text (as returned by ``compute_tfidf_keywords``),
    so that the keyword set is actually used for filtering.

    Args:
        text: Raw input text.

    Returns:
        A list of ``(subject, verb, object)`` string tuples in sentence
        order. Empty when the text is blank, has no keywords, or no sentence
        yields all three parts.
    """
    # Blank input cannot produce triples; skip the vectorizer and the parser.
    if not text or not text.strip():
        return []

    try:
        tfidf_keywords = compute_tfidf_keywords(text)
    except ValueError:
        # The vectorizer rejects text made only of stop words; with no
        # keywords there is nothing that could pass the filter below.
        return []
    if not tfidf_keywords:
        return []

    def _mentions_keyword(span):
        # A multi-word chunk matches if any of its tokens is a keyword.
        return any(tok.text.lower() in tfidf_keywords for tok in span)

    doc = nlp(text)
    triples = []

    for sent in doc.sents:
        root = next((tok for tok in sent if tok.dep_ == "ROOT"), None)
        if root is None:
            continue

        subject_chunk = None
        object_chunk = None
        # Later chunks overwrite earlier ones, so the last match wins.
        for chunk in sent.noun_chunks:
            dep = chunk.root.dep_
            if dep in ("nsubj", "nsubjpass"):
                subject_chunk = chunk
            elif dep in ("dobj", "pobj", "attr"):
                object_chunk = chunk

        if subject_chunk is None or object_chunk is None:
            continue

        subject = subject_chunk.text.strip()
        verb = root.lemma_.strip()
        obj = object_chunk.text.strip()
        if not (subject and verb and obj):
            continue

        if (
            verb.lower() in tfidf_keywords
            or _mentions_keyword(subject_chunk)
            or _mentions_keyword(object_chunk)
        ):
            triples.append((subject, verb, obj))

    return triples
63
 
64
  # === Visualization Function ===
 
71
  pos = nx.spring_layout(G)
72
  plt.figure(figsize=(10, 8))
73
  nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=2000, font_size=10)
74
+ nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['label'] for u, v, d in G.edges(data=True)})
75
  st.pyplot(plt)
76
 
77
  # === Streamlit UI ===
78
st.title("🧠 Knowledge Graph Generator (Enhanced with TF-IDF & Chunking)")

text_input = st.text_area("Paste your text here", height=200)

if st.button("Generate Graph"):
    # Whitespace-only input is treated as empty: it carries no terms for the
    # TF-IDF step and would otherwise fail inside the vectorizer.
    if text_input and text_input.strip():
        all_triples = extract_triples(text_input)

        if all_triples:
            # Display only the first 10 triples in the list view
            st.write("### Extracted Triples (showing top 10)")
            for t in all_triples[:10]:
                st.write("🔗", t)

            # Visualize all triples, not just the ones listed above
            show_graph(all_triples)
        else:
            # Avoid rendering a bare heading and an empty graph.
            st.info("No subject-verb-object triples could be extracted from this text.")
    else:
        st.warning("Please enter some text.")