koey811 committed on
Commit
5f59461
·
verified ·
1 Parent(s): 1155187

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import pdfplumber
from transformers import pipeline
import torch

# Pick the first GPU when available; -1 is the transformers convention
# for "run on CPU".
device = 0 if torch.cuda.is_available() else -1

# Summarization model (distilled BART fine-tuned on CNN/DailyMail).
# Initialized once at import time so every request reuses the same weights.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)

# Instruction-following seq2seq model used for prompt-guided field extraction.
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device
)
# Extract text from a PDF, bounded in pages to keep large files manageable.
def extract_pdf_text(pdf_file, max_pages=20):
    """Extract plain text from the first ``max_pages`` pages of a PDF.

    Parameters
    ----------
    pdf_file : str or file-like
        Anything accepted by ``pdfplumber.open`` (path or file object).
    max_pages : int, optional
        Upper bound on pages read, to cap time/memory on very large reports.

    Returns
    -------
    str
        The page texts, each followed by a newline. Pages with no
        extractable text (e.g. scanned images) are skipped. Empty string
        if nothing could be extracted.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        # Slicing clamps automatically, so short PDFs need no min() guard.
        for page in pdf.pages[:max_pages]:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # Accumulate then join: O(n) total, unlike repeated `text +=` which is
    # quadratic in the number of pages.
    return "".join(t + "\n" for t in page_texts)
# Greedy sentence packer: split on ". " and fill chunks up to the limit.
def _chunk_sentences(text, max_chunk_length):
    """Split *text* into chunks of at most ~``max_chunk_length`` characters.

    Sentences are kept whole; a single sentence longer than the limit
    becomes its own (over-long) chunk rather than being dropped.
    """
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):
        # +2 accounts for the ". " separator re-appended below, so chunks
        # genuinely stay within the limit.
        if len(current_chunk) + len(sentence) + 2 <= max_chunk_length:
            current_chunk += sentence + ". "
        else:
            if current_chunk:
                # Guard: without this, an over-long first sentence would
                # flush an empty "" chunk into the list.
                chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


# Summarize large text into a manageable length by chunking + per-chunk
# summarization, then concatenating the partial summaries.
def summarize_text(text, max_chunk_length=1000):
    """Summarize arbitrarily long *text* via chunked summarization.

    Parameters
    ----------
    text : str
        Raw text to compress.
    max_chunk_length : int, optional
        Approximate per-chunk character budget fed to the summarizer.

    Returns
    -------
    str
        The per-chunk summaries joined with spaces; "" for empty input.
    """
    if not text or not text.strip():
        # Nothing to summarize; avoid sending the model a bogus "." chunk.
        return ""
    summaries = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        for chunk in _chunk_sentences(text, max_chunk_length)
    ]
    return " ".join(summaries)
# Create the structured-extraction prompt for the LLM.
def create_prompt(summary_text):
    """Build the field-extraction prompt sent to the text2text model.

    Parameters
    ----------
    summary_text : str
        The (already summarized) report text to extract fields from.

    Returns
    -------
    str
        A prompt listing each requested "Label:" line followed by the
        summary. Built explicitly rather than with an indented triple-quoted
        literal so no incidental source indentation leaks into the prompt.
    """
    # Labels double as parse anchors: the caller splits the model response
    # on "<Label>:" to recover values.
    field_lines = "\n".join([
        "- Company Name:",
        "- Year of Report:",
        "- Industry Sector:",
        "- Total Emission Data:",
        "- Energy Intensity:",
        "- GHG Intensity:",
    ])
    header = "Extract clearly the following sustainability information from the provided summary:"
    return f"\n{header}\n\n{field_lines}\n\nSummary:\n{summary_text}\n"
# Parse "<Field>: value" pairs out of the model's free-text response.
def _parse_field_values(response, fields):
    """Return {field: value} parsed from *response*; "Not Found" on misses."""
    extracted_data = {}
    for field in fields:
        try:
            # Value is whatever sits between "<Field>:" and the next newline.
            field_value = response.split(f"{field}:")[1].split("\n")[0].strip()
        except IndexError:
            # Label absent from the response entirely.
            field_value = "Not Found"
        extracted_data[field] = field_value if field_value else "Not Found"
    return extracted_data


# Main extraction function: PDF -> text -> summary -> prompted LLM -> dict.
def extract_sustainability_data(pdf_file):
    """Run the full extraction pipeline on an uploaded PDF.

    Parameters
    ----------
    pdf_file : str or file-like
        The uploaded report, as delivered by the Gradio File component.

    Returns
    -------
    dict
        Field name -> extracted value (or "Not Found").
    """
    # Step 1: extract a bounded number of pages to manage resource usage.
    raw_text = extract_pdf_text(pdf_file, max_pages=30)  # adjust max_pages as needed

    # Step 2: summarize the extracted text to reduce token length.
    summary_text = summarize_text(raw_text)

    # Step 3: LLM prompt-based extraction on the summarized text.
    prompt = create_prompt(summary_text)

    # NOTE(review): temperature is ignored by transformers unless
    # do_sample=True; with num_beams=3, beam search governs decoding here.
    response = llm_pipeline(prompt, max_length=256, temperature=0.1, num_beams=3)[0]['generated_text']

    fields = ["Company Name", "Year of Report", "Industry Sector",
              "Total Emission Data", "Energy Intensity", "GHG Intensity"]
    return _parse_field_values(response, fields)
# Gradio UI: single-button flow — upload a PDF, click, view extracted JSON.
with gr.Blocks() as demo:
    gr.Markdown("# 🌱 Large Sustainability Report Extractor (Summarization + Prompt-guided LLM)")

    # Uploaded file is handed to the click callback as-is.
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    # The dict returned by extract_sustainability_data renders as JSON.
    output = gr.JSON(label="Extracted Sustainability Data")

    btn = gr.Button("Extract Data")
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

demo.launch()