| # -*-coding:utf-8 -*- | |
| import PyPDF2 | |
| from build_index.parser.base import BaseParser | |
class PDFParser(BaseParser):
    """Parser that extracts plain text from PDF files using PyPDF2."""

    def header_remove(self):
        """Remove the page headers of a research report.

        Placeholder: not implemented yet, currently does nothing.
        """
        pass

    def footnote_remove(self):
        """Remove the page footers of a research report.

        Placeholder: not implemented yet, currently does nothing.
        """
        pass

    def parse_file(self, file):
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file; it is opened in binary mode.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the extracted text of
            all pages joined with newlines. ``metadata`` is a dict with the
            keys ``'source'`` (the ``file`` argument as given) and
            ``'pages'`` (the number of pages in the document).
        """
        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            num_pages = len(pdf.pages)
            # Walk all pages, including the last one, so the extracted text
            # covers the same pages that the 'pages' metadata reports.
            # Extraction happens while the file is still open. The empty
            # string fallback keeps the join below safe if a page yields
            # no text value.
            text_list = [page.extract_text() or '' for page in pdf.pages]
        text = '\n'.join(text_list)
        metadata = {'source': file, 'pages': num_pages}
        return text, metadata