# docurizer / docurizzer_colab.py
# the-carnage's picture
# first push
# 066352b
from transformers import pipeline
from PIL import Image
import pytesseract
import pdfplumber
# Build the summarization pipeline once, at module load, so every later call
# to the summarizer reuses the same model instance.
print("Loading summarization model...")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
print("Model loaded successfully!")
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Location of the image; handed straight to
            ``PIL.Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Image.open leaves the underlying file open; using it as a context
    # manager guarantees the handle is released once OCR has finished,
    # even if pytesseract raises.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Location of the PDF; handed straight to
            ``pdfplumber.open``.

    Returns:
        The text of each page that yielded any, in page order, with a
        newline appended after each page. Pages for which
        ``extract_text()`` returns a falsy value are skipped, so a PDF
        with no extractable text gives ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join once instead of growing a string with += on every page.
        # The generator is fully consumed here, while the PDF is still open.
        return "".join(
            f"{page_text}\n" for page_text in page_texts if page_text
        )
def summarize_text(text, max_length=150, min_length=40, max_chars=4000):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Document text to condense.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.
        max_chars: Only the first ``max_chars`` characters of ``text`` are
            sent to the model. Defaults to 4000, the limit this function
            has always applied.

    Returns:
        The ``summary_text`` field of the first pipeline result, or ``""``
        when ``text`` is ``None``, empty, or whitespace-only.
    """
    # Nothing to summarize: skip the model call entirely rather than
    # feeding it an empty prompt.
    if not text or not text.strip():
        return ""

    output = summarizer(
        text[:max_chars],  # slicing already copes with text shorter than the cap
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # The character cap is only a rough proxy for token count; let the
        # tokenizer trim anything that still exceeds the model's input limit
        # instead of failing on over-long input.
        truncation=True,
    )
    return output[0]["summary_text"]
# Sample input for the demo run below: a short passage that is printed and
# then passed to summarize_text().
document = """
How is the air quality in your city Delhi today? Will there be an increase or decrease
in pollution levels? Can you breathe easily, or will you face breathing difficulties?
What is the current Air Quality Index (AQI) in Delhi, and what is the air pollution level like?
Additionally, what's the weather forecast for Delhi today and in the coming days?
For up-to-date data on Delhi's air quality and weather conditions, you can visit
IndiaToday's website. They provide AQI information for major cities across the country,
including Delhi, allowing you to assess the air quality in different regions.
Furthermore, you'll find news related to pollution on this page, giving you insights
into the reasons behind any fluctuations in Delhi's pollution levels.
"""
# Show the sample input under a banner: blank line, rule, title, rule, text.
print("\n" + "=" * 50, "ORIGINAL TEXT:", "=" * 50, document, sep="\n")

# The summary banner is printed before the model is invoked, so it precedes
# anything emitted while summarizing.
print("\n" + "=" * 50, "SUMMARY:", "=" * 50, sep="\n")
summary = summarize_text(document)
print(summary)