"""Gradio app: extract structured sustainability data from large PDF reports.

Pipeline: PDF text extraction (pdfplumber) -> chunked summarization
(distilbart-cnn) -> prompt-guided field extraction (flan-t5-small).
"""

import gradio as gr
import pdfplumber
import torch
from transformers import pipeline

# Use the first GPU when available, otherwise fall back to CPU (-1 is the
# transformers pipeline convention for CPU).
device = 0 if torch.cuda.is_available() else -1

# Loaded once at module import so every request reuses the same models.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device,
)


def extract_pdf_text(pdf_file, max_pages=20):
    """Return the concatenated text of the first `max_pages` pages of a PDF.

    Parameters
    ----------
    pdf_file : str | file-like
        Path or file object accepted by ``pdfplumber.open``.
    max_pages : int
        Upper bound on pages read, to keep memory/latency manageable for
        very large reports.

    Returns
    -------
    str
        One newline-terminated text block per page that yielded text.
        Pages with no extractable text (e.g. scanned images) are skipped.
    """
    parts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages[:max_pages]:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + "\n")
    return "".join(parts)  # join once instead of quadratic str +=


def summarize_text(text, max_chunk_length=1000):
    """Summarize `text` by chunking on sentence boundaries.

    The text is split into ~`max_chunk_length`-character chunks (never
    cutting a sentence in half), each chunk is summarized independently,
    and the per-chunk summaries are joined into one string.

    Returns an empty string for empty/whitespace-only input rather than
    invoking the summarizer on nothing.
    """
    if not text or not text.strip():
        return ""

    # Greedily pack whole sentences into chunks below the size limit.
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())

    summaries = []
    for chunk in chunks:
        if not chunk:  # guard: summarizer raises on empty input
            continue
        summary = summarizer(
            chunk, max_length=150, min_length=50, do_sample=False
        )[0]["summary_text"]
        summaries.append(summary)
    return " ".join(summaries)


def create_prompt(summary_text):
    """Build the field-extraction prompt wrapped around `summary_text`."""
    prompt = f"""
Extract clearly the following sustainability information from the provided summary:

- Company Name:
- Year of Report:
- Industry Sector:
- Total Emission Data:
- Energy Intensity:
- GHG Intensity:

Summary:
{summary_text}
"""
    return prompt


def extract_sustainability_data(pdf_file):
    """End-to-end extraction: PDF -> summary -> structured field dict.

    Returns a dict mapping each expected field name to the value the LLM
    produced, or "Not Found" when the field is absent from the response.
    """
    # Step 1: extract a bounded amount of text to manage resource usage.
    raw_text = extract_pdf_text(pdf_file, max_pages=30)
    if not raw_text.strip():
        # Image-only / empty PDF: nothing to summarize or extract.
        return {"error": "No extractable text found in the PDF."}

    # Step 2: summarize to fit the LLM's input window.
    summary_text = summarize_text(raw_text)

    # Step 3: prompt-guided extraction on the condensed text.
    # NOTE: `temperature` is ignored under beam search without
    # do_sample=True (and warns in recent transformers), so it is omitted;
    # do_sample=False keeps decoding deterministic.
    prompt = create_prompt(summary_text)
    response = llm_pipeline(
        prompt, max_length=256, num_beams=3, do_sample=False
    )[0]["generated_text"]

    # Parse the "Field: value" lines out of the model response.
    extracted_data = {}
    fields = [
        "Company Name",
        "Year of Report",
        "Industry Sector",
        "Total Emission Data",
        "Energy Intensity",
        "GHG Intensity",
    ]
    for field in fields:
        try:
            field_value = response.split(f"{field}:")[1].split("\n")[0].strip()
        except IndexError:
            field_value = "Not Found"
        extracted_data[field] = field_value if field_value else "Not Found"
    return extracted_data


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🌱 Large Sustainability Report Extractor "
        "(Summarization + Prompt-guided LLM)"
    )
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")
    btn = gr.Button("Extract Data")
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

# Launch only when run as a script, not when imported as a module.
if __name__ == "__main__":
    demo.launch()