adrien.aribaut-gaudin
first push from pages 9 to 25
2e3ba97
from src.model.container import Container
from src.model.paragraph import Paragraph
from src.tools.index_creation import set_good_indexes
from src.tools.reader import get_pdf_title_styles
class Doc:
def __init__(self, path='', id_=None):
self.title = self.get_good_title(path)
self.id_ = id(self)
self.path = path
paragraphs = get_pdf_title_styles(path)
self.container = Container(paragraphs, father=self)
set_good_indexes(self.container)
self.blocks = self.get_blocks()
def get_good_title(self,path):
if '/' in path:
res = path.split('/')[-1]
if '\\' in path:
res = path.split('\\')[-1]
else:
res = path
return res
@property
def structure(self):
return self.container.structure
def get_blocks(self):
def from_list_to_str(index_list):
index_str = str(index_list[0])
for el in index_list[1:]:
index_str += '.' + str(el)
return index_str
blocks = self.container.blocks
# blocks = self.delete_duplicate()
for block in blocks:
block.doc = self.title
block.index = from_list_to_str(block.index)
print(block.index + ' ' + block.title)
return blocks
def delete_duplicate(self):
while self.found_duplicates(self.container.blocks):
for i in range(len(self.container.blocks) - 1):
if self.container.blocks[i].index == self.container.blocks[i + 1].index:
if self.container.blocks[i].index != []:
self.container.blocks[i].index.pop()
return self.container.blocks
def found_duplicates(self, blocks):
for i in range(len(blocks) - 1):
if blocks[i].index == blocks[i + 1].index:
return True
return False
"""
current_level = len(current_index)
if 0 < block.level:
if block.level == current_level:
current_index[-1] += 1
elif current_level < block.level:
current_index.append(1)
elif block.level < current_level:
current_index = current_index[:block.level]
current_index[-1] += 1
block.index = from_list_to_str(current_index)
else:
block.index = "0"
"""