| | import gradio as gr |
| | import pdfplumber |
| | from transformers import pipeline |
| | import torch |
| |
|
| | |
# Select GPU (device 0) when CUDA is available, otherwise CPU.
# transformers pipelines use -1 to mean "run on CPU".
device = 0 if torch.cuda.is_available() else -1

# Summarization pipeline: distilled BART fine-tuned on CNN/DailyMail,
# used to condense long report text before structured extraction.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)

# Instruction-following seq2seq model (FLAN-T5) used to pull structured
# sustainability fields out of the summarized text via a prompt.
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device
)
| |
|
| | |
def extract_pdf_text(pdf_file, max_pages=20):
    """Extract plain text from the first ``max_pages`` pages of a PDF.

    Pages with no extractable text are skipped. Returns the page texts
    concatenated, each followed by a newline.
    """
    collected = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages[:max_pages]:
            content = page.extract_text()
            if content:
                collected.append(content + "\n")
    return "".join(collected)
| |
|
| | |
def summarize_text(text, max_chunk_length=1000):
    """Summarize long text by chunking on sentence boundaries.

    The text is split on ". " into chunks of at most roughly
    ``max_chunk_length`` characters (whole sentences kept together); each
    chunk is summarized independently and the summaries are joined.

    Fixes two edge cases of the previous version:
    - empty/whitespace input produced a bare "." chunk that was sent to
      the summarizer; now blank fragments are skipped and "" is returned;
    - a first sentence longer than ``max_chunk_length`` caused an empty
      string to be appended as a chunk; empty chunks are never emitted now.
    """
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):
        if not sentence.strip():
            continue  # skip empty fragments so we never summarize "."
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += sentence + ". "
        else:
            if current_chunk:  # guard: never append an empty chunk
                chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())

    # do_sample=False keeps each per-chunk summary deterministic.
    summaries = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(summaries)
| |
|
| | |
def create_prompt(summary_text):
    """Build the field-extraction prompt handed to the text2text LLM.

    Lists the sustainability fields to extract, then appends the
    summarized report text under a ``Summary:`` heading.
    """
    return f"""
Extract clearly the following sustainability information from the provided summary:

- Company Name:
- Year of Report:
- Industry Sector:
- Total Emission Data:
- Energy Intensity:
- GHG Intensity:

Summary:
{summary_text}
"""
| |
|
| | |
def parse_response_fields(response, fields):
    """Parse ``Field: value`` pairs out of an LLM response string.

    For each field name, the value runs from just after ``"{field}:"`` up
    to the next newline OR the next field label, whichever comes first.
    The old parser split only on "\\n", so when the model returned all
    fields on one line (typical for flan-t5) each value swallowed every
    following field label. Missing or empty values become "Not Found".

    Returns a dict mapping each field name to its extracted value.
    """
    extracted = {}
    for field in fields:
        marker = f"{field}:"
        start = response.find(marker)
        if start == -1:
            extracted[field] = "Not Found"
            continue
        tail = response[start + len(marker):]
        end = len(tail)
        newline = tail.find("\n")
        if newline != -1:
            end = newline
        # Truncate at the nearest following field label, if any.
        for other in fields:
            if other == field:
                continue
            pos = tail.find(f"{other}:")
            if pos != -1 and pos < end:
                end = pos
        value = tail[:end].strip()
        extracted[field] = value if value else "Not Found"
    return extracted


def extract_sustainability_data(pdf_file):
    """End-to-end pipeline: PDF -> text -> summary -> prompted LLM -> fields.

    Returns a dict of the six sustainability fields, each mapped to the
    extracted value or "Not Found".
    """
    raw_text = extract_pdf_text(pdf_file, max_pages=30)
    summary_text = summarize_text(raw_text)
    prompt = create_prompt(summary_text)

    # Low temperature + beams for near-deterministic, focused extraction.
    response = llm_pipeline(prompt, max_length=256, temperature=0.1, num_beams=3)[0]['generated_text']

    fields = ["Company Name", "Year of Report", "Industry Sector", "Total Emission Data", "Energy Intensity", "GHG Intensity"]
    return parse_response_fields(response, fields)
| |
|
| | |
# --- Gradio UI: upload a PDF, run the extraction pipeline, show JSON -------
with gr.Blocks() as demo:
    gr.Markdown("# 🌱 Large Sustainability Report Extractor (Summarization + Prompt-guided LLM)")

    # Input: the PDF report; output: extracted fields rendered as JSON.
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")

    btn = gr.Button("Extract Data")
    # Wire the button to the end-to-end extraction pipeline defined above.
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

# Start the Gradio server (blocks until interrupted).
demo.launch()