Spaces:
Runtime error
Runtime error
| from typing import Dict, List, Tuple, Optional | |
| from tqdm import tqdm | |
| from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
| from src.text_extractor import TextExtractor | |
| from mdutils.mdutils import MdUtils | |
| import torch | |
| import fitz | |
| import copy | |
| class Summarizer(): | |
| def __init__(self, model_name: str): | |
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
| self.tokenizer = PegasusTokenizer.from_pretrained(model_name) | |
| self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device) | |
| self.preprocess = TextExtractor() | |
| def extract_text(self, document: object) -> Dict[str, List[Tuple[str, str]]]: | |
| doc = fitz.open(document) | |
| self.filename = doc.name.split('/')[-1].split('.')[0] | |
| font_counts, styles = self.preprocess.get_font_info(doc, granularity=False) | |
| size_tag = self.preprocess.get_font_tags(font_counts, styles) | |
| texts = self.preprocess.assign_tags(doc, size_tag) | |
| slide_content = self.preprocess.get_slides(texts) | |
| return slide_content | |
| def __call__(self, slides: Dict[str, List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, str]]]: | |
| summarized_slides = copy.deepcopy(slides) | |
| for page, contents in tqdm(summarized_slides.items()): | |
| for idx, (tag, content) in enumerate(contents): | |
| if tag.startswith('p'): | |
| try: | |
| input = self.tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(self.device) | |
| tensor = self.model.generate(**input) | |
| summary = self.tokenizer.batch_decode(tensor, skip_special_tokens=True)[0] | |
| contents[idx] = (tag, summary) | |
| except Exception as e: | |
| print(f"Summarization Fails, Error: {e}") | |
| return summarized_slides | |
| def convert2markdown(self, summarized_slides: Dict[str, List[Tuple[str, str]]], target_path: Optional[str]=None) -> str: | |
| filename = self.filename | |
| if target_path: | |
| filename = target_path | |
| mdFile = MdUtils(file_name=filename) | |
| for k, v in summarized_slides.items(): | |
| mdFile.new_line('---\n') | |
| for section in v: | |
| tag = section[0] | |
| content = section[1] | |
| if tag.startswith('h'): | |
| try: | |
| mdFile.new_header(level=int(tag[1]), title=content) | |
| except: | |
| continue | |
| if tag == 'p': | |
| contents = content.split('<n>') | |
| for content in contents: | |
| mdFile.new_line(f"{content}\n") | |
| markdown = mdFile.create_md_file() | |
| return markdown | |
| def remove_leading_empty_lines(self, file_path) -> None: | |
| with open(file_path, 'r') as file: | |
| lines = file.readlines() | |
| non_empty_lines = [] | |
| found_first_word = False | |
| for line in lines: | |
| stripped_line = line.strip() | |
| if stripped_line and not found_first_word: | |
| found_first_word = True | |
| if found_first_word or stripped_line: | |
| non_empty_lines.append(line) | |
| with open(file_path, 'w') as file: | |
| file.writelines(non_empty_lines) | |
| return | |