| | import run
|
| | import util
|
| | import docx
|
| | from docx.oxml.ns import qn
|
| | from docx.shared import Pt,RGBColor
|
| | import fitz
|
| | import os
|
| | from fpdf import FPDF
|
| | import run
|
| | from BERT_inference import BertClassificationModel
|
| |
|
| |
|
def text_dump_to_lines(text, topic_num, max_length):
    """Run the summarisation pipeline on ``text`` and export the result.

    The text is passed through ``util.seg`` and ``run.texClear`` and the
    results are handed to ``run.textToAb`` together with ``topic_num`` and
    ``max_length`` (both coerced with ``int``).
    NOTE(review): the exact semantics of those helpers live in the ``util``
    and ``run`` modules and are not visible here.

    Returns:
        A 5-tuple ``(keys_text, output_text, txt_path, docx_path, pdf_path)``:
        the keys and the output lines each joined with newlines, followed by
        the paths returned by ``dump_to_txt``, ``dump_to_docx`` and
        ``dump_to_pdf`` for the output lines.
    """
    segments = util.seg(text)
    cleaned = run.texClear(segments)
    print(cleaned)

    keys, output = run.textToAb(
        cleaned,
        segments,
        int(topic_num),
        int(max_length),
    )
    joined_keys = "\n".join(keys)
    joined_output = "\n".join(output)
    print(keys, output)

    # The three exports are evaluated left to right: txt, docx, then pdf.
    return (
        joined_keys,
        joined_output,
        dump_to_txt(output),
        dump_to_docx(output),
        dump_to_pdf(output),
    )
|
| |
|
def file_dump_to_lines(file, topic_num, max_length):
    """Extract the text of a txt/docx/pdf file and run the summary pipeline.

    Args:
        file: Object exposing a ``name`` attribute that holds the path of the
            file on disk. The text after the last ``.`` selects the reader and
            is compared case-insensitively.
        topic_num: Forwarded unchanged to ``text_dump_to_lines``.
        max_length: Forwarded unchanged to ``text_dump_to_lines``.

    Returns:
        Whatever ``text_dump_to_lines`` returns for the extracted text, i.e.
        ``(keys_text, output_text, txt_path, docx_path, pdf_path)``.
    """
    lines = []

    # Lower-case the extension so "REPORT.PDF" or "notes.Docx" are recognised
    # instead of silently producing empty text.
    file_format = file.name.rsplit(".", 1)[-1].lower()

    if file_format == "txt":
        with open(file.name, encoding='utf-8') as f:
            content = f.read()
        stripped = (raw.strip() for raw in content.split("\n"))
        lines = [s for s in stripped if s != '']
    elif file_format == "docx":
        doc = docx.Document(file.name)
        # Paragraph text is taken as-is (empty paragraphs are kept).
        lines = [par.text for par in doc.paragraphs]
    elif file_format == "pdf":
        # Context manager closes the document so the file handle is not leaked.
        with fitz.open(file.name) as pdf:
            for page in pdf:
                page_text = page.get_text("text")
                stripped = (raw.strip() for raw in page_text.split("\n"))
                lines.extend(s for s in stripped if s != '')
    # Any other extension falls through with no lines, so the pipeline runs
    # on empty text exactly as before.

    text = "\n".join(lines)
    print(text)
    return text_dump_to_lines(text, topic_num, max_length)
|
| |
|
def dump_to_txt(lines):
    """Write ``lines`` to ``temp.txt`` and return the file's absolute path.

    The items are joined with newlines and written UTF-8 encoded into the
    current working directory, overwriting any existing ``temp.txt``.
    """
    target = 'temp.txt'
    # Join before opening so a bad item cannot truncate an existing file.
    payload = "\n".join(lines)
    with open(target, mode="w", encoding="utf-8") as out:
        out.write(payload)
    return os.path.abspath(target)
|
| |
|
def dump_to_docx(lines):
    """Write ``lines`` to ``temp.docx`` and return the file's absolute path.

    The document's ``Normal`` style is set to the 宋体 font (Latin and East
    Asian slots), 14 pt, black. An empty leading paragraph holding a black
    Cambria run is added first, then one paragraph per item in ``lines``.
    """
    document = docx.Document()

    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    # The East Asian font slot has to be set on the XML element directly.
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = Pt(14)
    normal_style.font.color.rgb = RGBColor(0,0,0)

    # Named ``lead_run`` so the imported ``run`` module is not shadowed here.
    lead_run = document.add_paragraph().add_run()
    lead_run.font.name = u'Cambria'
    lead_run.font.color.rgb = RGBColor(0,0,0)
    lead_run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria')

    for text in lines:
        document.add_paragraph(text)

    document.save(r'temp.docx')
    return os.path.abspath('temp.docx')
|
| |
|
def dump_to_pdf(lines):
    """Write ``lines`` to ``temp.pdf`` and return the file's absolute path.

    Uses the ``FZY3JW`` font loaded from ``FZY3JW.TTF`` at size 12. Each item
    is emitted in pieces of at most 45 characters, one cell per piece; an
    empty item therefore produces no cell. Any exception raised while writing
    cells is printed and the PDF is still saved with what was written so far.
    """
    pdf = FPDF()
    pdf.add_font('FZY3JW', '', 'FZY3JW.TTF', True)
    pdf.add_page()
    pdf.set_font("FZY3JW", size=12)

    chunk_width = 45
    try:
        for line in lines:
            for start in range(0, len(line), chunk_width):
                # Slicing clamps at the end of the line, so the final piece
                # needs no special case.
                pdf.cell(0, 5, line[start:start + chunk_width], ln=1)
    except Exception as e:
        print(e)

    pdf.output("temp.pdf")
    return os.path.abspath('temp.pdf')
|
| |
|
if __name__ == "__main__":
    # Manual run: feed the contents of test.txt through the pipeline with
    # topic_num=10 and max_length=50.
    with open('test.txt', 'r', encoding='utf-8') as sample_file:
        sample_text = sample_file.read()

    text_dump_to_lines(sample_text, 10, 50)