Spaces:
Running
Running
Commit
·
f64978e
1
Parent(s):
3668f92
Update app.py
Browse files
app.py
CHANGED
|
@@ -59,14 +59,14 @@ class Paper:
|
|
| 59 |
self.sl = sl
|
| 60 |
self.section_names = [] # 段落标题
|
| 61 |
self.section_texts = {} # 段落内容
|
|
|
|
| 62 |
if title == '':
|
| 63 |
self.pdf = fitz.open(self.path) # pdf文档
|
| 64 |
self.title = self.get_title()
|
| 65 |
self.parse_pdf()
|
| 66 |
else:
|
| 67 |
self.title = title
|
| 68 |
-
self.authers = authers
|
| 69 |
-
self.abs = abs
|
| 70 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
| 71 |
self.digit_num = [str(d+1) for d in range(10)]
|
| 72 |
self.first_image = ''
|
|
@@ -167,12 +167,13 @@ class Paper:
|
|
| 167 |
text = page.get_text("dict") # 获取页面上的文本信息
|
| 168 |
blocks = text["blocks"] # 获取文本块列表
|
| 169 |
for block in blocks: # 遍历每个文本块
|
| 170 |
-
if block["type"] == 0: # 如果是文字类型
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
| 176 |
max_font_sizes.sort()
|
| 177 |
print("max_font_sizes", max_font_sizes[-10:])
|
| 178 |
cur_title = ''
|
|
@@ -180,19 +181,20 @@ class Paper:
|
|
| 180 |
text = page.get_text("dict") # 获取页面上的文本信息
|
| 181 |
blocks = text["blocks"] # 获取文本块列表
|
| 182 |
for block in blocks: # 遍历每个文本块
|
| 183 |
-
if block["type"] == 0: # 如果是文字类型
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
cur_title
|
| 194 |
-
|
| 195 |
-
|
|
|
|
| 196 |
# break
|
| 197 |
title = cur_title.replace('\n', ' ')
|
| 198 |
return title
|
|
@@ -232,30 +234,12 @@ class Paper:
|
|
| 232 |
text = ''
|
| 233 |
text_list = []
|
| 234 |
section_dict = {}
|
| 235 |
-
|
| 236 |
-
# # 先处理Abstract章节
|
| 237 |
-
# for page_index, page in enumerate(self.pdf):
|
| 238 |
-
# cur_text = page.get_text()
|
| 239 |
-
# # 如果该页面是Abstract章节所在页面
|
| 240 |
-
# if page_index == list(self.section_page_dict.values())[0]:
|
| 241 |
-
# abs_str = "Abstract"
|
| 242 |
-
# # 获取Abstract章节的起始位置
|
| 243 |
-
# first_index = cur_text.find(abs_str)
|
| 244 |
-
# # 查找下一个章节的关键词,这里是Introduction
|
| 245 |
-
# intro_str = "Introduction"
|
| 246 |
-
# if intro_str in cur_text:
|
| 247 |
-
# second_index = cur_text.find(intro_str)
|
| 248 |
-
# elif intro_str.upper() in cur_text:
|
| 249 |
-
# second_index = cur_text.find(intro_str.upper())
|
| 250 |
-
# # 将Abstract章节内容加入字典中
|
| 251 |
-
# section_dict[abs_str] = cur_text[first_index+len(abs_str)+1:second_index].replace('-\n',
|
| 252 |
-
# '').replace('\n', ' ').split('I.')[0].split("II.")[0]
|
| 253 |
-
|
| 254 |
# 再处理其他章节:
|
| 255 |
text_list = [page.get_text() for page in self.pdf]
|
| 256 |
for sec_index, sec_name in enumerate(self.section_page_dict):
|
| 257 |
print(sec_index, sec_name, self.section_page_dict[sec_name])
|
| 258 |
-
if sec_index <= 0:
|
| 259 |
continue
|
| 260 |
else:
|
| 261 |
# 直接考虑后面的内容:
|
|
|
|
| 59 |
self.sl = sl
|
| 60 |
self.section_names = [] # 段落标题
|
| 61 |
self.section_texts = {} # 段落内容
|
| 62 |
+
self.abs = abs
|
| 63 |
if title == '':
|
| 64 |
self.pdf = fitz.open(self.path) # pdf文档
|
| 65 |
self.title = self.get_title()
|
| 66 |
self.parse_pdf()
|
| 67 |
else:
|
| 68 |
self.title = title
|
| 69 |
+
self.authers = authers
|
|
|
|
| 70 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
| 71 |
self.digit_num = [str(d+1) for d in range(10)]
|
| 72 |
self.first_image = ''
|
|
|
|
| 167 |
text = page.get_text("dict") # 获取页面上的文本信息
|
| 168 |
blocks = text["blocks"] # 获取文本块列表
|
| 169 |
for block in blocks: # 遍历每个文本块
|
| 170 |
+
if block["type"] == 0 and len(block['lines']): # 如果是文字类型
|
| 171 |
+
if len(block["lines"][0]["spans"]):
|
| 172 |
+
font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
|
| 173 |
+
max_font_sizes.append(font_size)
|
| 174 |
+
if font_size > max_font_size: # 如果字体大小大于当前最大值
|
| 175 |
+
max_font_size = font_size # 更新最大值
|
| 176 |
+
max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
|
| 177 |
max_font_sizes.sort()
|
| 178 |
print("max_font_sizes", max_font_sizes[-10:])
|
| 179 |
cur_title = ''
|
|
|
|
| 181 |
text = page.get_text("dict") # 获取页面上的文本信息
|
| 182 |
blocks = text["blocks"] # 获取文本块列表
|
| 183 |
for block in blocks: # 遍历每个文本块
|
| 184 |
+
if block["type"] == 0 and len(block['lines']): # 如果是文字类型
|
| 185 |
+
if len(block["lines"][0]["spans"]):
|
| 186 |
+
cur_string = block["lines"][0]["spans"][0]["text"] # text of the first span on the current block's first line
|
| 187 |
+
font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
|
| 188 |
+
font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
|
| 189 |
+
# print(font_size)
|
| 190 |
+
if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
|
| 191 |
+
# print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
|
| 192 |
+
if len(cur_string) > 4 and "arXiv" not in cur_string:
|
| 193 |
+
# print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
|
| 194 |
+
if cur_title == '' :
|
| 195 |
+
cur_title += cur_string
|
| 196 |
+
else:
|
| 197 |
+
cur_title += ' ' + cur_string
|
| 198 |
# break
|
| 199 |
title = cur_title.replace('\n', ' ')
|
| 200 |
return title
|
|
|
|
| 234 |
text = ''
|
| 235 |
text_list = []
|
| 236 |
section_dict = {}
|
| 237 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
# 再处理其他章节:
|
| 239 |
text_list = [page.get_text() for page in self.pdf]
|
| 240 |
for sec_index, sec_name in enumerate(self.section_page_dict):
|
| 241 |
print(sec_index, sec_name, self.section_page_dict[sec_name])
|
| 242 |
+
if sec_index <= 0 and self.abs:
|
| 243 |
continue
|
| 244 |
else:
|
| 245 |
# 直接考虑后面的内容:
|