# app.py — Sustainability report extractor (summarization + prompt-guided LLM)
import gradio as gr
import pdfplumber
from transformers import pipeline
import torch
# Select the device for inference: GPU 0 when CUDA is available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Initialize the two model pipelines used by the app:
# - summarizer: condenses long report text into short chunks
# - llm_pipeline: answers the structured-extraction prompt
# NOTE(review): both constructors download model weights on first run — this is
# a network side effect at import time.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device
)
# Function to extract text from PDF (optimized for large PDFs)
def extract_pdf_text(pdf_file, max_pages=20):
    """Return the plain text of the first ``max_pages`` pages of a PDF.

    Pages with no extractable text layer (e.g. scanned images) are skipped.
    Each extracted page's text is followed by a newline, matching the
    original accumulation behavior.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        # Slice caps the work for very large documents.
        for page in pdf.pages[:max_pages]:
            content = page.extract_text()
            if content:
                page_texts.append(content)
    # join avoids repeated string concatenation; keep the trailing "\n" per page.
    return "".join(content + "\n" for content in page_texts)
# Summarize large text into manageable length
def summarize_text(text, max_chunk_length=1000):
    """Summarize ``text`` by splitting it into sentence-aligned chunks.

    The text is split on ". " boundaries and greedily packed into chunks of
    at most roughly ``max_chunk_length`` characters; each chunk is summarized
    independently and the partial summaries are joined with spaces.

    Returns an empty string for empty/whitespace-only input instead of
    sending a degenerate "." chunk to the model (the original behavior).
    """
    # Guard: empty input would otherwise produce a junk "." chunk below.
    if not text or not text.strip():
        return ""
    sentences = text.split(". ")
    current_chunk = ""
    chunks = []
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    summaries = []
    for chunk in chunks:
        # Skip blank chunks — the summarizer errors or hallucinates on them.
        if not chunk:
            continue
        summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)
# Create structured extraction prompt
def create_prompt(summary_text):
    """Build the field-extraction prompt handed to the text2text LLM.

    The prompt lists the sustainability fields to fill in, followed by the
    summarized report text.
    """
    field_lines = "\n".join(
        f"- {name}:"
        for name in (
            "Company Name",
            "Year of Report",
            "Industry Sector",
            "Total Emission Data",
            "Energy Intensity",
            "GHG Intensity",
        )
    )
    # Byte-for-byte identical to the original triple-quoted template.
    return (
        "\nExtract clearly the following sustainability information from the provided summary:\n"
        + field_lines
        + "\nSummary:\n"
        + summary_text
        + "\n"
    )
# Main extraction function
def extract_sustainability_data(pdf_file):
    """Extract structured sustainability fields from an uploaded PDF.

    Pipeline: limited-page text extraction -> chunked summarization ->
    prompt-guided LLM extraction -> field parsing.

    Returns a dict mapping each field name to its extracted value, or
    "Not Found" when the field is absent (or the PDF has no text layer,
    e.g. a scanned/image-only document).
    """
    fields = ["Company Name", "Year of Report", "Industry Sector",
              "Total Emission Data", "Energy Intensity", "GHG Intensity"]
    # Step 1: extract limited text from PDF to manage resource usage
    raw_text = extract_pdf_text(pdf_file, max_pages=30)  # adjust max_pages as needed
    # Guard: scanned/image-only PDFs yield no text — don't feed the models garbage.
    if not raw_text.strip():
        return {field: "Not Found" for field in fields}
    # Step 2: summarize this extracted text to reduce token length
    summary_text = summarize_text(raw_text)
    # Step 3: LLM prompt-based extraction on the summarized text
    prompt = create_prompt(summary_text)
    response = llm_pipeline(prompt, max_length=256, temperature=0.1, num_beams=3)[0]['generated_text']
    # Parse the "Field: value" lines out of the model response.
    extracted_data = {}
    for field in fields:
        try:
            # Everything after "Field:" up to the next newline.
            field_value = response.split(f"{field}:")[1].split("\n")[0].strip()
        except IndexError:
            field_value = "Not Found"
        extracted_data[field] = field_value if field_value else "Not Found"
    return extracted_data
# Gradio UI: single-button flow — upload a PDF, click, view extracted fields as JSON.
with gr.Blocks() as demo:
    gr.Markdown("# 🌱 Large Sustainability Report Extractor (Summarization + Prompt-guided LLM)")
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")
    btn = gr.Button("Extract Data")
    # Wire the button to the end-to-end extraction pipeline.
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

# Launches the web server (blocking) when the script is run.
demo.launch()