File size: 1,294 Bytes
22a285c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import fitz
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained "t5-small" tokenizer and model once, at import time;
# summarize_text() below reuses these module-level objects on every call.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF as a single line of text.

    Args:
        file: A file-like object whose ``read()`` returns the PDF data.

    Returns:
        The text of all pages concatenated in page order, with surrounding
        whitespace stripped and every newline replaced by a space.
    """
    # Open from memory and use the document as a context manager so the
    # PyMuPDF handle is closed even if text extraction raises.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf:
        # join() avoids repeated string concatenation across pages.
        text = "".join(page.get_text() for page in pdf)
    return text.strip().replace("\n", " ")

def chunk_text(text, chunk_size=450):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    each chunk is its words re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in original order (empty if *text* has no
        words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def summarize_text(text, *, chunk_size=450, max_length=500, min_length=100,
                   num_beams=4):
    """Summarize *text* with the module-level T5 model, chunk by chunk.

    The text is flattened to a single line, split into word chunks with
    ``chunk_text``, each chunk is summarized independently, and the partial
    summaries are joined with single spaces.

    Args:
        text: The text to summarize.
        chunk_size: Number of words per chunk, passed to ``chunk_text``.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The chunk summaries joined by spaces; an empty string when *text*
        contains no words.
    """
    flattened = text.replace('\n', ' ').strip()
    summaries = []

    for chunk in chunk_text(flattened, chunk_size):
        # The tokenizer truncates each prompt to 512 tokens, so any part of a
        # chunk beyond that limit is not seen by the model.
        input_ids = tokenizer.encode(
            "summarize: " + chunk,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        # NOTE(review): do_sample=True combined with beam search presumably
        # makes the output vary between runs — confirm this is intended.
        summary_ids = model.generate(
            input_ids,
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=num_beams,
            early_stopping=True,
            do_sample=True,
        )
        summaries.append(
            tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        )

    return ' '.join(summaries)