playground_pdfsum / summarizer.py
ngupta949's picture
Upload 3 files
ffb9e96 verified
raw
history blame contribute delete
881 Bytes
from transformers import pipeline
import PyPDF2
# Build the Hugging Face summarization pipeline once, at import time, so that
# every call in this module reuses the same loaded model.
_SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
summarizer_pipeline = pipeline("summarization", model=_SUMMARIZATION_MODEL)
def extract_text_from_pdf(file):
    """Return the text of every page in a PDF as one stripped string.

    Args:
        file: Either a filesystem path given as ``str`` (opened here in
            binary mode) or any other object, which is handed to
            ``PyPDF2.PdfReader`` unchanged (e.g. an open binary stream).

    Returns:
        The per-page text joined with no separator, with leading and
        trailing whitespace removed. A page whose extraction yields a
        falsy result contributes an empty string.
    """

    def _read_all_pages(source):
        # One reader per call; pages are concatenated in document order.
        pdf = PyPDF2.PdfReader(source)
        pieces = []
        for page in pdf.pages:
            pieces.append(page.extract_text() or "")
        return "".join(pieces)

    # Anything that is not a path string goes straight to the reader.
    if not isinstance(file, str):
        return _read_all_pages(file).strip()

    # Path string: open it ourselves so the handle is closed afterwards.
    with open(file, "rb") as handle:
        combined = _read_all_pages(handle)
    return combined.strip()
def summarize_text(text, max_length=130, min_length=30):
    """Summarize ``text`` with the module-level summarization pipeline.

    Args:
        text: The text to summarize. ``None``, an empty string, or a
            whitespace-only string is treated as "no text".
        max_length: Forwarded to the pipeline as the upper bound on the
            generated summary length.
        min_length: Forwarded to the pipeline as the lower bound on the
            generated summary length.

    Returns:
        The generated summary string, or the fixed message
        ``"No valid text found in the PDF."`` when there is nothing to
        summarize.
    """
    # Guard against None as well as blank input so callers always get the
    # friendly message instead of an AttributeError.
    if not text or not text.strip():
        return "No valid text found in the PDF."

    # Only the first 3000 characters are summarized. A character cap does
    # not bound the token count, so also ask the pipeline to truncate any
    # input that would still exceed the model's maximum input length.
    summary = summarizer_pipeline(
        text[:3000],
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']