Cpp4App_test / SEM /find_subtitle.py
HaochenGong
create
f1554a2
import csv
import os
import bs4
def find_title_Label(path):
a = 0
soup = bs4.BeautifulSoup(open(path,encoding='utf-8'), features="html.parser")
all_list = ["","","","","","",""]
list_index = ['h1','h2','h3','h4','h5','strong','b']
h1_list = soup.find_all('h1')
if len(h1_list) <= 2:
h1_list = None
try:
for h1 in h1_list:
all_list[0] += h1.text
except Exception:
a = 1
h2_list = soup.find_all('h2')
if len(h2_list) <= 2:
h2_list = None
try:
for h2 in h2_list:
all_list[1] += h2.text
except Exception:
a = 1
h3_list = soup.find_all('h3')
if len(h3_list) <= 2:
h3_list = None
try:
for h3 in h3_list:
all_list[2] += h3.text
except Exception:
a = 1
h4_list = soup.find_all('h4')
if len(h4_list) <= 2:
h4_list = None
try:
for h4 in h4_list:
all_list[3] += h4.text
except Exception:
a = 1
h5_list = soup.find_all('h5')
if len(h5_list) <= 2:
h5_list = None
try:
for h5 in h5_list:
all_list[4] += h5.text
except Exception:
a = 1
strong_list = soup.find_all('strong')
if len(strong_list) <= 2:
strong_list = None
try:
for st in strong_list:
all_list[5] += st.text
except Exception:
a = 1
b_list = soup.find_all('b')
if len(b_list) <= 2:
b_list = None
try:
for b in b_list:
all_list[6] += b.text
except Exception:
a = 1
long = 0
maxLongList = None
for list in all_list:
if list == None:
continue
clean_list = list.lower()
if "information" in clean_list and "collect" in clean_list:
return list_index[all_list.index(list)]
if "information" in clean_list and "use" in clean_list:
return list_index[all_list.index(list)]
if "change" in clean_list and "data" in clean_list:
return list_index[all_list.index(list)]
if len(list) > long:
long = len(list)
maxLongList = list
if maxLongList == None:
return "TitleError"
return list_index[all_list.index(maxLongList)]
def find_title_Label_with_html(file):
a = 0
soup = bs4.BeautifulSoup(file, features="html.parser")
all_list = ["","","","","","",""]
list_index = ['h1','h2','h3','h4','h5','strong','b']
h1_list = soup.find_all('h1')
if len(h1_list) <= 2:
h1_list = None
try:
for h1 in h1_list:
all_list[0] += h1.text
except Exception:
a = 1
h2_list = soup.find_all('h2')
if len(h2_list) <= 2:
h2_list = None
try:
for h2 in h2_list:
all_list[1] += h2.text
except Exception:
a = 1
h3_list = soup.find_all('h3')
if len(h3_list) <= 2:
h3_list = None
try:
for h3 in h3_list:
all_list[2] += h3.text
except Exception:
a = 1
h4_list = soup.find_all('h4')
if len(h4_list) <= 2:
h4_list = None
try:
for h4 in h4_list:
all_list[3] += h4.text
except Exception:
a = 1
h5_list = soup.find_all('h5')
if len(h5_list) <= 2:
h5_list = None
try:
for h5 in h5_list:
all_list[4] += h5.text
except Exception:
a = 1
strong_list = soup.find_all('strong')
if len(strong_list) <= 2:
strong_list = None
try:
for st in strong_list:
all_list[5] += st.text
except Exception:
a = 1
b_list = soup.find_all('b')
if len(b_list) <= 2:
b_list = None
try:
for b in b_list:
all_list[6] += b.text
except Exception:
a = 1
long = 0
maxLongList = None
for list in all_list:
if list == None:
continue
clean_list = list.lower()
if "information" in clean_list and "collect" in clean_list:
return list_index[all_list.index(list)]
if "information" in clean_list and "use" in clean_list:
return list_index[all_list.index(list)]
if "change" in clean_list and "data" in clean_list:
return list_index[all_list.index(list)]
if len(list) > long:
long = len(list)
maxLongList = list
if maxLongList == None:
return "TitleError"
return list_index[all_list.index(maxLongList)]