Commit
·
0393e86
1
Parent(s):
3de8751
use logging
Browse files
kg_builder/src/graph_creation.py
CHANGED
|
@@ -1,9 +1,15 @@
|
|
|
|
|
|
|
|
| 1 |
from langchain_community.document_loaders import WikipediaLoader
|
| 2 |
from langchain.text_splitter import TokenTextSplitter
|
| 3 |
from knowledge_graph_builder import extract_and_store_graph
|
| 4 |
from dotenv import load_dotenv
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# Load environment variables
|
| 8 |
load_dotenv()
|
| 9 |
|
|
@@ -32,25 +38,25 @@ def build_graph_for_article(query, data_source_name):
|
|
| 32 |
chunk_size=400
|
| 33 |
chunk_overlap=10
|
| 34 |
|
| 35 |
-
|
| 36 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
| 37 |
if not raw_documents:
|
| 38 |
-
|
| 39 |
return
|
| 40 |
|
| 41 |
-
|
| 42 |
for doc in raw_documents:
|
| 43 |
-
|
| 44 |
#print(f"Document: {doc.page_content}")
|
| 45 |
|
| 46 |
-
|
| 47 |
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 48 |
chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
|
| 53 |
-
|
| 54 |
#print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
|
| 55 |
extract_and_store_graph(chunkDoc, data_source_name)
|
| 56 |
|
|
|
|
import logging

from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from dotenv import load_dotenv
from tqdm import tqdm

from knowledge_graph_builder import extract_and_store_graph

# Root logging configuration: every logger in the process (including
# library loggers) emits INFO and above as "name - level - message".
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger. It inherits the INFO effective level from the
# root configuration above, so the previous explicit
# logger.setLevel(logging.INFO) was redundant and has been dropped —
# the effective level is unchanged.
logger = logging.getLogger(__name__)

# Load environment variables from a local .env file.
# NOTE(review): this runs after knowledge_graph_builder is imported, so
# it assumes that module reads os.environ at call time, not at import
# time — confirm against knowledge_graph_builder.
load_dotenv()
|
|
|
|
| 38 |
chunk_size=400
|
| 39 |
chunk_overlap=10
|
| 40 |
|
| 41 |
+
logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
|
| 42 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
| 43 |
if not raw_documents:
|
| 44 |
+
logger.error(f"Failed to load content for query: {query}")
|
| 45 |
return
|
| 46 |
|
| 47 |
+
logger.info(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
|
| 48 |
for doc in raw_documents:
|
| 49 |
+
logger.info(f"Document: {doc.metadata['source']}")
|
| 50 |
#print(f"Document: {doc.page_content}")
|
| 51 |
|
| 52 |
+
logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
|
| 53 |
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 54 |
chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
|
| 55 |
+
logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
|
| 56 |
|
| 57 |
+
logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
|
| 58 |
for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
|
| 59 |
+
logger.info(f"Extract data from chunk {str(i)} ...")
|
| 60 |
#print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
|
| 61 |
extract_and_store_graph(chunkDoc, data_source_name)
|
| 62 |
|