michaelwechner committed on
Commit
4e9cef7
·
1 Parent(s): db4de01

logging improved

Browse files
Files changed (1) hide show
  1. kg_builder/src/graph_creation.py +36 -13
kg_builder/src/graph_creation.py CHANGED
@@ -7,26 +7,49 @@ from tqdm import tqdm
7
  # Load environment variables
8
  load_dotenv()
9
 
10
# Articles to load: category label -> Wikipedia query string.
articles = {
    "Chemotherapy": "Chemotherapy",
    "Traffic Law": "Traffic laws in the United States",
}
15
 
16
def build_graph_for_article(article_name, category):
    """Load one Wikipedia article, split it into token chunks, and store each
    chunk in the knowledge graph under *category*."""
    print(f"Loading documents for: {article_name}")
    # Fetch the raw Wikipedia documents for this article name.
    raw_documents = WikipediaLoader(query=article_name).load()
    if not raw_documents:
        print(f"Failed to load content for {article_name}")
        return

    # Chunk the text; only the first 5 loaded documents are processed.
    splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
    chunks = splitter.split_documents(raw_documents[:5])

    print("Building the knowledge graph...")
    for chunk in tqdm(chunks, total=len(chunks)):
        extract_and_store_graph(chunk, category)
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def main():
32
  for category, title in articles.items():
 
7
  # Load environment variables
8
  load_dotenv()
9
 
10
# Define articles / topics to load: category label -> Wikipedia search query.
# NOTE: re-add "Chemotherapy": "Chemotherapy" here to rebuild that graph too.
articles = {
    "Traffic Law": "Traffic laws in the United States"
}
 
19
def build_graph_for_article(query, category, load_max_documents=5, chunk_size=400, chunk_overlap=10):
    """
    Build knowledge graph from loaded articles / documents of a particular topic.

    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States"
    :param category: Category label stored with the extracted graph, e.g. "Traffic Law"
    :param load_max_documents: Maximum number of Wikipedia documents to load (default 5)
    :param chunk_size: Tokens per chunk handed to the splitter (default 400)
    :param chunk_overlap: Token overlap between consecutive chunks (default 10)
    :return: None
    """
    print(f"Loading document(s) from Wikipedia using query '{query}' ...")
    raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
    if not raw_documents:
        print(f"Failed to load content for query: {query}")
        return

    print(f"{len(raw_documents)} document(s) loaded from Wikipedia.")
    for doc in raw_documents:
        # Log the source URL of each loaded document for traceability.
        print(f"Document: {doc.metadata['source']}")

    print(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # The loader already caps results at load_max_documents; the slice is a defensive no-op.
    chunk_docs = text_splitter.split_documents(raw_documents[:load_max_documents])
    print(f"{len(raw_documents)} document(s) split into {len(chunk_docs)} chunk(s)")

    print(f"Building the knowledge graph for document(s) found by query '{query}' ...")
    for i, chunk_doc in tqdm(enumerate(chunk_docs), total=len(chunk_docs)):
        print(f"Extract data from chunk {i} ...")
        extract_and_store_graph(chunk_doc, category)
53
 
54
  def main():
55
  for category, title in articles.items():