Spaces:

koey811
/

test1

Build error

App Files Files Community

koey811 committed on Mar 24, 2025

Commit

5f59461

verified ·

1 Parent(s): 1155187

Create app.py

Browse files

Files changed (1) hide show

app.py +108 -0

app.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import gradio as gr
+import pdfplumber
+from transformers import pipeline
+import torch
# Pick the device index handed to both pipelines: 0 when CUDA is available,
# otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build both Hugging Face pipelines once, at import time, so every request
# reuses the same pipeline objects.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)
llm_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    device=device,
)
# Function to extract text from PDF (reads only the leading pages of large PDFs)
def extract_pdf_text(pdf_file, max_pages=20):
    """Return the text of the leading pages of a PDF.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the document
            (it is passed through unchanged).
        max_pages: Maximum number of leading pages to read. ``None`` reads
            every page; zero or a negative value reads no pages.

    Returns:
        The concatenated page texts, each followed by a newline. Pages whose
        ``extract_text()`` result is empty or ``None`` are skipped, so the
        result can be an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages
        if max_pages is not None:
            # Clamp at zero so a negative limit means "no pages" instead of
            # slicing from the end of the list.
            pages = pages[:max(max_pages, 0)]
        # Extract while the document is still open.
        page_texts = [page.extract_text() for page in pages]
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
# Summarize large text into manageable length
def _split_into_chunks(text, max_chunk_length):
    """Group the sentences of *text* into chunks below *max_chunk_length* characters.

    Sentences are found by splitting on ". ". Blank pieces are dropped, and a
    single sentence that is itself longer than the limit becomes its own
    (oversized) chunk rather than being discarded.
    """
    chunks = []
    current = ""
    for piece in text.split(". "):
        piece = piece.strip()
        if not piece:
            continue
        # The split consumed the period; restore it unless the piece already
        # ends a sentence (true for the final piece of most texts).
        sentence = piece if piece.endswith((".", "!", "?")) else f"{piece}."
        if not current:
            current = sentence
        elif len(current) + 1 + len(sentence) < max_chunk_length:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


def summarize_text(text, max_chunk_length=1000):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize. ``None`` or blank text yields ``""``
            without invoking the summarization model.
        max_chunk_length: Target upper bound, in characters, for each chunk
            sent to the summarizer.

    Returns:
        The chunk summaries joined with single spaces.
    """
    chunks = _split_into_chunks(text or "", max_chunk_length)
    summaries = [
        # truncation=True keeps an oversized chunk (e.g. a table with no
        # sentence breaks) within the model's maximum input length.
        summarizer(
            chunk,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(summaries)
# Create structured extraction prompt
def create_prompt(summary_text):
    """Build the instruction prompt that asks the LLM for the report fields.

    The prompt starts with a blank line, lists one ``- <field>:`` bullet per
    requested field, and ends with the given summary followed by a newline.
    """
    requested_fields = (
        "Company Name",
        "Year of Report",
        "Industry Sector",
        "Total Emission Data",
        "Energy Intensity",
        "GHG Intensity",
    )
    bullet_lines = "\n".join(f"- {field_name}:" for field_name in requested_fields)
    return (
        "\n"
        "Extract clearly the following sustainability information from the provided summary:\n"
        f"{bullet_lines}\n"
        "Summary:\n"
        f"{summary_text}\n"
    )
# Main extraction function
_REPORT_FIELDS = (
    "Company Name",
    "Year of Report",
    "Industry Sector",
    "Total Emission Data",
    "Energy Intensity",
    "GHG Intensity",
)


def _parse_fields(response, fields):
    """Pull the ``<field>: value`` pairs out of the model's free-text answer.

    A value runs from just after its label to the next newline or the next
    field label, whichever comes first, so answers returned on a single line
    are still split per field. Missing or blank values become "Not Found".
    """
    labels = [f"{field}:" for field in fields]
    parsed = {}
    for field, label in zip(fields, labels):
        label_at = response.find(label)
        if label_at == -1:
            parsed[field] = "Not Found"
            continue
        start = label_at + len(label)
        stops = [response.find("\n", start)]
        stops.extend(response.find(other, start) for other in labels if other != label)
        end = min((stop for stop in stops if stop != -1), default=len(response))
        value = response[start:end].strip()
        parsed[field] = value if value else "Not Found"
    return parsed


def extract_sustainability_data(pdf_file):
    """Extract the sustainability fields from an uploaded PDF report.

    Args:
        pdf_file: The value delivered by the upload component, or ``None``
            when nothing was uploaded.

    Returns:
        A dict with one entry per report field. A field is "Not Found" when
        no file was given, no text could be extracted, or the model's answer
        does not contain it.
    """
    not_found = {field: "Not Found" for field in _REPORT_FIELDS}
    if pdf_file is None:
        return not_found

    # NOTE(review): some Gradio versions hand over a temp-file wrapper whose
    # ``.name`` is the path, others a plain path string - confirm against the
    # installed version. Falling back to the object itself covers both.
    pdf_source = getattr(pdf_file, "name", pdf_file)

    # Step 1: extract limited text from PDF to manage resource usage
    raw_text = extract_pdf_text(pdf_source, max_pages=30)  # adjust max_pages as needed
    if not raw_text.strip():
        # Nothing extractable: skip both model calls.
        return not_found

    # Step 2: summarize this extracted text to reduce token length
    summary_text = summarize_text(raw_text)

    # Step 3: LLM prompt-based extraction on the summarized text.
    # No ``temperature``: decoding is beam search without sampling, where
    # temperature is ignored and only triggers a warning.
    prompt = create_prompt(summary_text)
    response = llm_pipeline(prompt, max_length=256, num_beams=3)[0]["generated_text"]

    return _parse_fields(response, _REPORT_FIELDS)
# Gradio UI: a PDF upload, a JSON result panel, and a button that feeds the
# upload through extract_sustainability_data.
with gr.Blocks() as demo:
    gr.Markdown(value="# 🌱 Large Sustainability Report Extractor (Summarization + Prompt-guided LLM)")
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")
    btn = gr.Button(value="Extract Data")
    btn.click(extract_sustainability_data, pdf_input, output)

# Launch the app when this file is executed.
demo.launch()