Commit · 4e9cef7
1 Parent(s): db4de01
logging improved

kg_builder/src/graph_creation.py CHANGED (+36 -13)
@@ -7,26 +7,49 @@ from tqdm import tqdm
 # Load environment variables
 load_dotenv()
 
-# Define articles to load
+# Define articles / topics to load
+#articles = {
+#    "Chemotherapy": "Chemotherapy",
+#    "Traffic Law": "Traffic laws in the United States"
+#}
 articles = {
-    "Chemotherapy": "Chemotherapy",
     "Traffic Law": "Traffic laws in the United States"
 }
 
-def build_graph_for_article(
-
-
-
+def build_graph_for_article(query, category):
+    """
+    Build knowledge graph from loaded articles / documents of a particular topic
+    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States"
+    :param category: For example "Traffic Law"
+    :return:
+    """
+    load_max_documents = 5
+    #chunk_size=4096
+    #chunk_overlap=96
+    chunk_size=400
+    chunk_overlap=10
+
+    print(f"Loading document(s) from Wikipedia using query '{query}' ...")
+    raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
     if not raw_documents:
-        print(f"Failed to load content for {
+        print(f"Failed to load content for query: {query}")
         return
-
-    text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
-    documents = text_splitter.split_documents(raw_documents[:5]) # Only process the first 5 documents
 
-    print("
-    for
-
+    print(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
+    for doc in raw_documents:
+        print(f"Document: {doc.metadata['source']}")
+        #print(f"Document: {doc.page_content}")
+
+    print(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
+    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
+    print(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
+
+    print(f"Building the knowledge graph for document(s) found by query '{query}' ...")
+    for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
+        print(f"Extract data from chunk {str(i)} ...")
+        #print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
+        extract_and_store_graph(chunkDoc, category)
 
 def main():
     for category, title in articles.items():
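
For context, a minimal sketch of how main() presumably wires up the revised function (the hunk cuts off inside main()): it assumes main() passes each dict value (the Wikipedia article title) as the query and the dict key as the category, and that extract_and_store_graph is defined elsewhere in graph_creation.py. This is an illustration, not part of the commit.

# Hypothetical sketch of the caller, matching the new
# build_graph_for_article(query, category) signature.
def main():
    for category, title in articles.items():
        # Assumption: the article title doubles as the Wikipedia query.
        build_graph_for_article(query=title, category=category)

if __name__ == "__main__":
    main()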