Commit
·
6feb084
1
Parent(s):
9a0d22f
use UnstructuredHTMLLoader
Browse files
- kg_builder/src/graph_creation.py +14 -10
kg_builder/src/graph_creation.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import logging
|
| 2 |
|
| 3 |
-
from langchain_community.document_loaders import WikipediaLoader
|
| 4 |
from langchain.text_splitter import TokenTextSplitter
|
| 5 |
from knowledge_graph_builder import extract_and_store_graph
|
| 6 |
from dotenv import load_dotenv
|
|
@@ -15,18 +15,21 @@ load_dotenv()
|
|
| 15 |
|
| 16 |
# IMPORTANT: Make sure data source names match with values inside api_connections.py
|
| 17 |
# Define articles / topics to load
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
|
| 23 |
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
|
| 24 |
articles = {
|
| 25 |
"Traffic Law": "Traffic laws in the United States"
|
| 26 |
}
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def build_graph_for_article(query, data_source_name):
|
| 32 |
"""
|
|
@@ -43,7 +46,8 @@ def build_graph_for_article(query, data_source_name):
|
|
| 43 |
|
| 44 |
if data_source_name == "SquirroDocs":
|
| 45 |
logger.info(f"Loading document(s) from public website {query} ...")
|
| 46 |
-
|
|
|
|
| 47 |
else:
|
| 48 |
logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
|
| 49 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
|
@@ -52,7 +56,7 @@ def build_graph_for_article(query, data_source_name):
|
|
| 52 |
logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
|
| 53 |
return
|
| 54 |
|
| 55 |
-
logger.info(f"{str(len(raw_documents))} document(s) loaded
|
| 56 |
for doc in raw_documents:
|
| 57 |
logger.info(f"Document: {doc.metadata['source']}")
|
| 58 |
#print(f"Document: {doc.page_content}")
|
|
|
|
| 1 |
import logging
|
| 2 |
|
| 3 |
+
from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
|
| 4 |
from langchain.text_splitter import TokenTextSplitter
|
| 5 |
from knowledge_graph_builder import extract_and_store_graph
|
| 6 |
from dotenv import load_dotenv
|
|
|
|
| 15 |
|
| 16 |
# IMPORTANT: Make sure data source names match with values inside api_connections.py
|
| 17 |
# Define articles / topics to load
|
| 18 |
+
articlesDISABLED = {
|
| 19 |
+
"Chemotherapy": "Chemotherapy",
|
| 20 |
+
"Traffic Law": "Traffic laws in the United States"
|
| 21 |
+
}
|
| 22 |
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
|
| 23 |
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
|
| 24 |
articles = {
|
| 25 |
"Traffic Law": "Traffic laws in the United States"
|
| 26 |
}
|
| 27 |
+
articlesDISABLED = {
|
| 28 |
+
"SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
|
| 29 |
+
}
|
| 30 |
+
articlesDISABLED = {
|
| 31 |
+
"SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
|
| 32 |
+
}
|
| 33 |
|
| 34 |
def build_graph_for_article(query, data_source_name):
|
| 35 |
"""
|
|
|
|
| 46 |
|
| 47 |
if data_source_name == "SquirroDocs":
|
| 48 |
logger.info(f"Loading document(s) from public website {query} ...")
|
| 49 |
+
loader = UnstructuredHTMLLoader(query)
|
| 50 |
+
raw_documents = loader.load()
|
| 51 |
else:
|
| 52 |
logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
|
| 53 |
raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
|
|
|
|
| 56 |
logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
|
| 57 |
return
|
| 58 |
|
| 59 |
+
logger.info(f"{str(len(raw_documents))} document(s) loaded.")
|
| 60 |
for doc in raw_documents:
|
| 61 |
logger.info(f"Document: {doc.metadata['source']}")
|
| 62 |
#print(f"Document: {doc.page_content}")
|