enoch10jason's picture
Moved FastAPI app files to root for Hugging Face
22a285c
import fitz
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
def extract_text_from_pdf(file):
text = ""
pdf = fitz.open(stream=file.read(), filetype="pdf")
for page in pdf:
text += page.get_text()
return text.strip().replace("\n", " ")
def chunk_text(text, chunk_size=450):
words = text.split()
return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
def summarize_text(text):
text = text.replace('\n',' ').strip()
chunks = chunk_text(text)
summaries = []
for chunk in chunks:
input_text = "summarize: " + chunk
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(
input_ids,
max_length=500,
min_length=100,
length_penalty=2.0,
num_beams=4,
early_stopping=True,
do_sample = True
)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summaries.append(summary)
# output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# return output
return ' '.join(summaries)