Spaces:
Runtime error
Runtime error
remove special chars (#15)
Browse files
- buster/docparser.py +7 -5
buster/docparser.py
CHANGED
|
@@ -54,8 +54,10 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
|
|
| 54 |
else:
|
| 55 |
section = parse_section(section_soup.children)
|
| 56 |
|
| 57 |
-
url
|
| 58 |
-
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# If text is too long, split into chunks of equal sizes
|
| 61 |
if len(section) > max_section_length:
|
|
@@ -81,14 +83,14 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
|
|
| 81 |
names = []
|
| 82 |
for file in files:
|
| 83 |
filepath = os.path.join(root_dir, file)
|
| 84 |
-
with open(filepath, "r") as
|
| 85 |
-
source =
|
| 86 |
|
| 87 |
soup = BeautifulSoup(source, "html.parser")
|
| 88 |
sections_file, urls_file, names_file = get_all_subsections(soup)
|
| 89 |
sections.extend(sections_file)
|
| 90 |
|
| 91 |
-
urls_file = [base_url +
|
| 92 |
urls.extend(urls_file)
|
| 93 |
|
| 94 |
names.extend(names_file)
|
|
|
|
| 54 |
else:
|
| 55 |
section = parse_section(section_soup.children)
|
| 56 |
|
| 57 |
+
# Remove special characters, plus newlines in some url and section names.
|
| 58 |
+
section = section.strip()
|
| 59 |
+
url = section_found["href"].strip().replace("\n", "")
|
| 60 |
+
name = section_found.parent.text.strip()[:-1].replace("\n", "")
|
| 61 |
|
| 62 |
# If text is too long, split into chunks of equal sizes
|
| 63 |
if len(section) > max_section_length:
|
|
|
|
| 83 |
names = []
|
| 84 |
for file in files:
|
| 85 |
filepath = os.path.join(root_dir, file)
|
| 86 |
+
with open(filepath, "r") as f:
|
| 87 |
+
source = f.read()
|
| 88 |
|
| 89 |
soup = BeautifulSoup(source, "html.parser")
|
| 90 |
sections_file, urls_file, names_file = get_all_subsections(soup)
|
| 91 |
sections.extend(sections_file)
|
| 92 |
|
| 93 |
+
urls_file = [base_url + file + url for url in urls_file]
|
| 94 |
urls.extend(urls_file)
|
| 95 |
|
| 96 |
names.extend(names_file)
|