# app.py — Sustainability report extractor (summarization + prompt-guided LLM)
import gradio as gr
import pdfplumber
from transformers import pipeline
import torch
# Select the device for inference: GPU 0 when CUDA is available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Initialize the two model pipelines used by the app:
# - summarizer: condenses long report text into short chunks
# - llm_pipeline: answers the structured-extraction prompt
# NOTE(review): both constructors download model weights on first run — this is
# a network side effect at import time.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device
)
# Function to extract text from PDF (optimized for large PDFs)
def extract_pdf_text(pdf_file, max_pages=20):
    """Return the plain text of the first ``max_pages`` pages of a PDF.

    Pages with no extractable text layer (e.g. scanned images) are skipped.
    Each extracted page's text is followed by a newline, matching the
    original accumulation behavior.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        # Slice caps the work for very large documents.
        for page in pdf.pages[:max_pages]:
            content = page.extract_text()
            if content:
                page_texts.append(content)
    # join avoids repeated string concatenation; keep the trailing "\n" per page.
    return "".join(content + "\n" for content in page_texts)
# Summarize large text into manageable length
def summarize_text(text, max_chunk_length=1000):
    """Summarize ``text`` by splitting it into sentence-aligned chunks.

    The text is split on ". " boundaries and greedily packed into chunks of
    at most roughly ``max_chunk_length`` characters; each chunk is summarized
    independently and the partial summaries are joined with spaces.

    Returns an empty string for empty/whitespace-only input instead of
    sending a degenerate "." chunk to the model (the original behavior).
    """
    # Guard: empty input would otherwise produce a junk "." chunk below.
    if not text or not text.strip():
        return ""
    sentences = text.split(". ")
    current_chunk = ""
    chunks = []
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    summaries = []
    for chunk in chunks:
        # Skip blank chunks — the summarizer errors or hallucinates on them.
        if not chunk:
            continue
        summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)
# Create structured extraction prompt
def create_prompt(summary_text):
    """Build the field-extraction prompt handed to the text2text LLM.

    The prompt lists the sustainability fields to fill in, followed by the
    summarized report text.
    """
    field_lines = "\n".join(
        f"- {name}:"
        for name in (
            "Company Name",
            "Year of Report",
            "Industry Sector",
            "Total Emission Data",
            "Energy Intensity",
            "GHG Intensity",
        )
    )
    # Byte-for-byte identical to the original triple-quoted template.
    return (
        "\nExtract clearly the following sustainability information from the provided summary:\n"
        + field_lines
        + "\nSummary:\n"
        + summary_text
        + "\n"
    )
# Main extraction function
def extract_sustainability_data(pdf_file):
    """Extract structured sustainability fields from an uploaded PDF.

    Pipeline: limited-page text extraction -> chunked summarization ->
    prompt-guided LLM extraction -> field parsing.

    Returns a dict mapping each field name to its extracted value, or
    "Not Found" when the field is absent (or the PDF has no text layer,
    e.g. a scanned/image-only document).
    """
    fields = ["Company Name", "Year of Report", "Industry Sector",
              "Total Emission Data", "Energy Intensity", "GHG Intensity"]
    # Step 1: extract limited text from PDF to manage resource usage
    raw_text = extract_pdf_text(pdf_file, max_pages=30)  # adjust max_pages as needed
    # Guard: scanned/image-only PDFs yield no text — don't feed the models garbage.
    if not raw_text.strip():
        return {field: "Not Found" for field in fields}
    # Step 2: summarize this extracted text to reduce token length
    summary_text = summarize_text(raw_text)
    # Step 3: LLM prompt-based extraction on the summarized text
    prompt = create_prompt(summary_text)
    response = llm_pipeline(prompt, max_length=256, temperature=0.1, num_beams=3)[0]['generated_text']
    # Parse the "Field: value" lines out of the model response.
    extracted_data = {}
    for field in fields:
        try:
            # Everything after "Field:" up to the next newline.
            field_value = response.split(f"{field}:")[1].split("\n")[0].strip()
        except IndexError:
            field_value = "Not Found"
        extracted_data[field] = field_value if field_value else "Not Found"
    return extracted_data
# Gradio UI: single-button flow — upload a PDF, click, view extracted fields as JSON.
with gr.Blocks() as demo:
    gr.Markdown("# 🌱 Large Sustainability Report Extractor (Summarization + Prompt-guided LLM)")
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")
    btn = gr.Button("Extract Data")
    # Wire the button to the end-to-end extraction pipeline.
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

# Launches the web server (blocking) when the script is run.
demo.launch()