"""Gradio app: extract structured sustainability data from large PDF reports.

Pipeline: PDF text extraction (pdfplumber) -> chunked summarization
(distilbart-cnn) -> prompt-guided field extraction (flan-t5-small).
"""

import gradio as gr
import pdfplumber
import torch
from transformers import pipeline

# Use the first GPU when available, otherwise fall back to CPU (-1 is the
# transformers pipeline convention for CPU).
device = 0 if torch.cuda.is_available() else -1

# Loaded once at module import so every request reuses the same models.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device,
)


def extract_pdf_text(pdf_file, max_pages=20):
    """Return the concatenated text of the first `max_pages` pages of a PDF.

    Parameters
    ----------
    pdf_file : str | file-like
        Path or file object accepted by ``pdfplumber.open``.
    max_pages : int
        Upper bound on pages read, to keep memory/latency manageable for
        very large reports.

    Returns
    -------
    str
        One newline-terminated text block per page that yielded text.
        Pages with no extractable text (e.g. scanned images) are skipped.
    """
    parts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages[:max_pages]:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + "\n")
    return "".join(parts)  # join once instead of quadratic str +=


def summarize_text(text, max_chunk_length=1000):
    """Summarize `text` by chunking on sentence boundaries.

    The text is split into ~`max_chunk_length`-character chunks (never
    cutting a sentence in half), each chunk is summarized independently,
    and the per-chunk summaries are joined into one string.

    Returns an empty string for empty/whitespace-only input rather than
    invoking the summarizer on nothing.
    """
    if not text or not text.strip():
        return ""

    # Greedily pack whole sentences into chunks below the size limit.
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())

    summaries = []
    for chunk in chunks:
        if not chunk:  # guard: summarizer raises on empty input
            continue
        summary = summarizer(
            chunk, max_length=150, min_length=50, do_sample=False
        )[0]["summary_text"]
        summaries.append(summary)
    return " ".join(summaries)


def create_prompt(summary_text):
    """Build the field-extraction prompt wrapped around `summary_text`."""
    prompt = f"""
Extract clearly the following sustainability information from the provided summary:

- Company Name:
- Year of Report:
- Industry Sector:
- Total Emission Data:
- Energy Intensity:
- GHG Intensity:

Summary:
{summary_text}
"""
    return prompt


def extract_sustainability_data(pdf_file):
    """End-to-end extraction: PDF -> summary -> structured field dict.

    Returns a dict mapping each expected field name to the value the LLM
    produced, or "Not Found" when the field is absent from the response.
    """
    # Step 1: extract a bounded amount of text to manage resource usage.
    raw_text = extract_pdf_text(pdf_file, max_pages=30)
    if not raw_text.strip():
        # Image-only / empty PDF: nothing to summarize or extract.
        return {"error": "No extractable text found in the PDF."}

    # Step 2: summarize to fit the LLM's input window.
    summary_text = summarize_text(raw_text)

    # Step 3: prompt-guided extraction on the condensed text.
    # NOTE: `temperature` is ignored under beam search without
    # do_sample=True (and warns in recent transformers), so it is omitted;
    # do_sample=False keeps decoding deterministic.
    prompt = create_prompt(summary_text)
    response = llm_pipeline(
        prompt, max_length=256, num_beams=3, do_sample=False
    )[0]["generated_text"]

    # Parse the "Field: value" lines out of the model response.
    extracted_data = {}
    fields = [
        "Company Name",
        "Year of Report",
        "Industry Sector",
        "Total Emission Data",
        "Energy Intensity",
        "GHG Intensity",
    ]
    for field in fields:
        try:
            field_value = response.split(f"{field}:")[1].split("\n")[0].strip()
        except IndexError:
            field_value = "Not Found"
        extracted_data[field] = field_value if field_value else "Not Found"
    return extracted_data


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🌱 Large Sustainability Report Extractor "
        "(Summarization + Prompt-guided LLM)"
    )
    pdf_input = gr.File(label="Upload Sustainability Report (PDF, Large files supported)")
    output = gr.JSON(label="Extracted Sustainability Data")
    btn = gr.Button("Extract Data")
    btn.click(fn=extract_sustainability_data, inputs=pdf_input, outputs=output)

# Launch only when run as a script, not when imported as a module.
if __name__ == "__main__":
    demo.launch()