# Research Paper Summarizer using LangChain and Gradio
# Hugging Face Spaces ready – robust chunking for large PDFs
import gradio as gr
import os
import html
import tempfile
from io import BytesIO

from dotenv import load_dotenv
import PyPDF2
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer

# Pull OPENAI_API_KEY / HUGGINGFACE_TOKEN from a local .env when present.
load_dotenv()


# --- Helper for robust chunking ---
def chunk_text_for_hf(text, tokenizer, max_tokens=1024, overlap=50):
    """Split text into chunks compatible with Hugging Face summarizers.

    Encodes the full text once, then slides a ``max_tokens``-wide window
    across the token sequence with ``overlap`` tokens of context carried
    between consecutive chunks, decoding each window back to a string.

    Args:
        text: Raw text to split.
        tokenizer: Object exposing ``encode``/``decode`` (e.g. an
            ``AutoTokenizer``).
        max_tokens: Maximum tokens per chunk (model input limit).
        overlap: Tokens shared between adjacent chunks to preserve context.

    Returns:
        List of chunk strings; a single-element list when the text already
        fits within ``max_tokens``.
    """
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)

    if total_tokens <= max_tokens:
        return [text]

    # Guard: if overlap >= max_tokens the window would never advance and the
    # loop below would spin forever. Always move forward by at least 1 token.
    step = max(1, max_tokens - overlap)

    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
        start += step
    return chunks


def summarize_long_text_hf(text, summarizer, tokenizer, max_tokens=1024,
                           overlap=50, max_length=150, min_length=40):
    """Summarize long text by chunking and combining summaries (Hugging Face models).

    Each token-bounded chunk is summarized independently (greedy decoding,
    ``do_sample=False``) and the partial summaries are joined with spaces.

    Args:
        text: Full document text.
        summarizer: A ``transformers`` summarization pipeline.
        tokenizer: Tokenizer matching the summarizer's model.
        max_tokens / overlap: Chunking parameters (see ``chunk_text_for_hf``).
        max_length / min_length: Per-chunk summary length bounds in tokens.

    Returns:
        Concatenated summary string covering the whole input.
    """
    text_chunks = chunk_text_for_hf(text, tokenizer, max_tokens, overlap)
    summaries = []
    for chunk in text_chunks:
        # truncation=True is a safety net in case decode/re-encode drift pushes
        # a chunk slightly over the model's input limit.
        summary = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True
        )[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)


class ResearchPaperSummarizer:
    """Orchestrates model selection, PDF text extraction, and summarization."""

    def __init__(self):
        self.llm = None             # LangChain LLM wrapper (OpenAI or HF pipeline)
        self.model_info = ""        # Human-readable description of the loaded model
        self.hf_tokenizer = None    # Tokenizer for HF chunking path
        self.hf_summarizer = None   # transformers summarization pipeline
        self.is_hf_pipeline = False # True -> use robust HF chunking, not LangChain chains

    def setup_llm(self, model_choice):
        """Setup LLM based on user choice.

        Args:
            model_choice: One of the dropdown labels (e.g. "OpenAI GPT-4",
                "Hugging Face BART").

        Returns:
            (success, message) tuple; ``message`` is user-facing status text.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        self.is_hf_pipeline = False

        try:
            if "OpenAI" in model_choice:
                if not openai_api_key:
                    return False, "āŒ OpenAI API Key not found in environment variables. Please add OPENAI_API_KEY to your Hugging Face Space settings."
                # LangChain's OpenAI wrappers read the key from the environment.
                os.environ["OPENAI_API_KEY"] = openai_api_key

                if "GPT-4" in model_choice:
                    self.llm = ChatOpenAI(model_name="gpt-4", temperature=0.3)
                    self.model_info = "šŸš€ Using GPT-4 (Premium)"
                else:
                    self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
                    self.model_info = "šŸš€ Using GPT-3.5 Turbo"
            else:
                # Free path: local transformers pipeline, no API key required.
                self.is_hf_pipeline = True
                if "BART" in model_choice:
                    model_id = "facebook/bart-large-cnn"
                else:
                    model_id = "t5-base"

                # NOTE(review): use_auth_token is deprecated in newer
                # transformers releases in favor of `token=` — confirm the
                # pinned transformers version before migrating.
                self.hf_summarizer = pipeline(
                    "summarization",
                    model=model_id,
                    tokenizer=model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                self.hf_tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                # Wrapped for API symmetry with the OpenAI path; summarization
                # actually goes through summarize_long_text_hf, not this wrapper.
                self.llm = HuggingFacePipeline(pipeline=self.hf_summarizer)
                self.model_info = f"šŸ¤— Using {model_id} model"

            return True, f"āœ… Model loaded successfully! {self.model_info}"
        except Exception as e:
            return False, f"āŒ Error loading model: {str(e)}"

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from uploaded PDF.

        Args:
            pdf_file: Path/file object from the Gradio File component.

        Returns:
            (text, message) — ``text`` is None on failure.
        """
        try:
            if pdf_file is None:
                return None, "āŒ No PDF file uploaded"

            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pages = []
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
            # Join with newlines so the last word of one page doesn't fuse
            # with the first word of the next (the original "+=" did that).
            text = "\n".join(pages)

            if not text.strip():
                return None, "āŒ No text could be extracted from the PDF"
            return text, f"āœ… Successfully extracted {len(text):,} characters from PDF"
        except Exception as e:
            return None, f"āŒ Error reading PDF: {str(e)}"

    def create_documents(self, text):
        """Split text into manageable chunks for LangChain LLMs."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        return [Document(page_content=chunk) for chunk in chunks]

    def generate_summary(self, documents, summary_type="map_reduce", raw_text=None):
        """Generate summary using LangChain or robust HF chunking.

        Args:
            documents: LangChain ``Document`` chunks (OpenAI path).
            summary_type: "map_reduce", "stuff", or anything else -> "refine".
            raw_text: Full original text; required for the HF chunking path.

        Returns:
            Summary string, or an error string starting with "āŒ".
        """
        try:
            # For Hugging Face models, bypass LangChain chains entirely and use
            # token-aware chunking (summarization pipelines can't follow chain
            # prompts the way chat models can).
            if self.is_hf_pipeline and raw_text and self.hf_summarizer and self.hf_tokenizer:
                return summarize_long_text_hf(
                    raw_text,
                    self.hf_summarizer,
                    self.hf_tokenizer,
                    max_tokens=1024,
                    overlap=50,
                    max_length=150,
                    min_length=40
                )

            # For OpenAI or other models, use a LangChain summarization chain.
            chain_type = summary_type if summary_type in ("map_reduce", "stuff") else "refine"
            chain = load_summarize_chain(self.llm, chain_type=chain_type, verbose=False)
            return chain.run(documents)
        except Exception as e:
            return f"āŒ Error generating summary: {str(e)}"

    def create_structured_summary(self, text, documents):
        """Create a structured summary with different sections.

        Returns a dict with 'overall' and 'key_points' entries.
        """
        summaries = {}

        # Overall Summary
        summaries['overall'] = self.generate_summary(documents, "map_reduce", raw_text=text)

        # Key Points - Use first 8000 chars for key points
        key_points_text = text[:8000] if len(text) > 8000 else text
        # NOTE(review): HF summarization models don't follow instructions, so
        # on the free path this yields a summary of the excerpt rather than a
        # bullet list — acceptable fallback, instruction works for OpenAI.
        key_points_prompt = f"""
        Extract the 5-7 most important key points from this research paper:
        
        {key_points_text}
        """
        key_points_docs = [Document(page_content=key_points_prompt)]
        summaries['key_points'] = self.generate_summary(
            key_points_docs, "stuff", raw_text=key_points_prompt
        )
        return summaries

    def create_pdf_summary(self, summaries, paper_title="Research Paper Summary"):
        """Create PDF with the summary.

        Args:
            summaries: Dict with 'overall' and optionally 'key_points' text.
            paper_title: Title rendered at the top of the PDF.

        Returns:
            Path to a temp PDF file (caller/Gradio is responsible for cleanup).
        """
        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []

        # Title
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor='darkblue'
        )
        # html.escape: ReportLab's Paragraph parses mini-XML markup, so raw
        # '<', '>' or '&' in model output (common in ML papers) would raise.
        story.append(Paragraph(html.escape(paper_title), title_style))
        story.append(Spacer(1, 12))

        # Overall Summary
        story.append(Paragraph("Overall Summary", styles['Heading2']))
        story.append(Spacer(1, 12))
        story.append(Paragraph(
            html.escape(summaries.get('overall', 'No summary available')),
            styles['Normal']
        ))
        story.append(Spacer(1, 20))

        # Key Points
        if 'key_points' in summaries:
            story.append(Paragraph("Key Points", styles['Heading2']))
            story.append(Spacer(1, 12))
            story.append(Paragraph(html.escape(summaries['key_points']), styles['Normal']))

        doc.build(story)
        buffer.seek(0)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(buffer.getvalue())
        temp_file.close()
        return temp_file.name


# Initialize the summarizer (single shared instance for the Gradio app)
summarizer = ResearchPaperSummarizer()


def process_paper(pdf_file, model_choice, summary_type, include_key_points, paper_title):
    """Main function to process the research paper.

    Returns:
        (status_message, overall_summary, key_points, pdf_file_path) matching
        the four Gradio output components; pdf_file_path may be None.
    """
    # Setup model
    success, message = summarizer.setup_llm(model_choice)
    if not success:
        return message, "", "", None
    status_message = message + "\n\n"

    # Extract text from PDF
    text, extract_message = summarizer.extract_text_from_pdf(pdf_file)
    status_message += extract_message + "\n\n"
    if text is None:
        return status_message, "", "", None

    # Create documents
    documents = summarizer.create_documents(text)
    status_message += f"šŸ“ Text split into {len(documents)} chunks for processing\n\n"

    # Generate summary
    status_message += "šŸ”„ Generating summary... Please wait...\n\n"
    try:
        if include_key_points:
            summaries = summarizer.create_structured_summary(text, documents)
            overall_summary = summaries.get('overall', 'No summary generated')
            key_points = summaries.get('key_points', 'No key points generated')
        else:
            overall_summary = summarizer.generate_summary(documents, summary_type, raw_text=text)
            key_points = "Key points not requested"
            summaries = {'overall': overall_summary}

        status_message += "šŸŽ‰ Summary generated successfully!"

        # Generate PDF if title is provided
        pdf_file_path = None
        if paper_title and paper_title.strip():
            try:
                pdf_file_path = summarizer.create_pdf_summary(summaries, paper_title.strip())
                status_message += "\nšŸ“„ PDF summary created!"
            except Exception as e:
                # Best-effort: a PDF failure shouldn't discard the summary.
                status_message += f"\nāš ļø PDF creation failed: {str(e)}"

        return status_message, overall_summary, key_points, pdf_file_path
    except Exception as e:
        return status_message + f"āŒ Error during processing: {str(e)}", "", "", None


def get_model_info(model_choice):
    """Return information about the selected model."""
    model_descriptions = {
        "OpenAI GPT-3.5": "šŸ’” **Fast and Efficient** - Good for most tasks, paid API required",
        "OpenAI GPT-4": "šŸš€ **Highest Quality** - Most advanced summaries, paid API required",
        "Hugging Face BART": "šŸ†“ **Free Model** - Optimized for summarization, slower on first load",
        "Hugging Face T5": "šŸ†“ **Free Versatile** - Good general-purpose model, slower on first load"
    }
    return model_descriptions.get(model_choice, "")


# Custom CSS for beautiful styling
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-interface {
    background: rgba(255, 255, 255, 0.95);
    backdrop-filter: blur(10px);
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
}
.gr-box {
    border-radius: 15px;
    border: 2px solid #e1e5e9;
    background: linear-gradient(145deg, #ffffff, #f0f2f5);
}
.gr-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border: none;
    border-radius: 10px;
    color: white;
    font-weight: bold;
    transition: transform 0.2s;
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.gr-textbox, .gr-dropdown {
    border-radius: 10px;
    border: 2px solid #e1e5e9;
}
.gr-file {
    border-radius: 15px;
    border: 3px dashed #667eea;
    background: linear-gradient(145deg, #f8f9ff, #ffffff);
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css, title="šŸ”¬ Research Paper Summarizer", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # šŸ”¬ Research Paper Summarizer
        ### Transform lengthy research papers into concise, insightful summaries using AI

        Upload your PDF research paper and get an intelligent summary with key points extracted automatically!
        """,
        elem_classes="header"
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## šŸ“ Upload & Configure")

            pdf_input = gr.File(
                label="šŸ“„ Upload Research Paper (PDF)",
                file_types=[".pdf"],
                elem_classes="file-upload"
            )

            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI GPT-3.5",
                    "OpenAI GPT-4",
                    "Hugging Face BART",
                    "Hugging Face T5"
                ],
                value="Hugging Face BART",
                label="šŸ¤– Choose AI Model",
                info="Free models work without API keys"
            )

            model_info = gr.Markdown("")

            summary_type = gr.Dropdown(
                choices=["map_reduce", "stuff", "refine"],
                value="map_reduce",
                label="šŸ“‹ Summary Method",
                info="map_reduce: best for long papers | stuff: faster for short papers | refine: iterative improvement"
            )

            include_key_points = gr.Checkbox(
                label="šŸ”‘ Include Key Points",
                value=True,
                info="Extract important key points separately"
            )

            paper_title = gr.Textbox(
                label="šŸ“ Paper Title (for PDF export)",
                placeholder="Enter the title of your research paper...",
                info="Optional: Used as title in the generated PDF summary"
            )

            process_btn = gr.Button(
                "šŸš€ Generate Summary",
                variant="primary",
                size="lg",
                elem_classes="process-button"
            )

        with gr.Column(scale=2):
            gr.Markdown("## šŸ“Š Results")

            status_output = gr.Textbox(
                label="šŸ“ˆ Processing Status",
                lines=8,
                max_lines=10,
                interactive=False,
                show_copy_button=True
            )

            summary_output = gr.Textbox(
                label="šŸ“‹ Overall Summary",
                lines=10,
                max_lines=15,
                interactive=False,
                show_copy_button=True,
                placeholder="Your paper summary will appear here..."
            )

            key_points_output = gr.Textbox(
                label="šŸ”‘ Key Points",
                lines=8,
                max_lines=12,
                interactive=False,
                show_copy_button=True,
                placeholder="Key points will be extracted here..."
            )

            pdf_output = gr.File(
                label="šŸ“„ Download PDF Summary",
                interactive=False
            )

    with gr.Accordion("šŸ”§ Setup Instructions for API Keys", open=False):
        gr.Markdown(
            """
            ### For Enhanced Performance (Optional):

            **OpenAI API Setup:**
            1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
            2. In your Hugging Face Space settings, add: `OPENAI_API_KEY = your_key_here`
            3. Restart your Space to apply changes

            **Hugging Face Token Setup:**
            1. Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
            2. Add: `HUGGINGFACE_TOKEN = your_token_here`
            3. Provides access to gated models and higher rate limits

            **Note:** Free Hugging Face models work without any API keys but may be slower on first load.
            """
        )

    with gr.Accordion("šŸ’” Tips for Best Results", open=False):
        gr.Markdown(
            """
            ### Optimization Tips:

            - **šŸ“„ File Size:** Smaller PDFs (< 10MB) process faster
            - **šŸ¤– Model Choice:** OpenAI models provide highest quality but require API keys
            - **⚔ Speed:** "stuff" method is fastest for papers under 20 pages
            - **šŸ“Š Quality:** "map_reduce" works best for comprehensive summaries of long papers
            - **šŸ”„ First Load:** Hugging Face models may take 2-3 minutes to load initially
            - **šŸ“± Mobile:** Works on mobile devices but desktop recommended for large files
            """
        )

    # Show the model description whenever the dropdown changes.
    model_choice.change(
        fn=get_model_info,
        inputs=[model_choice],
        outputs=[model_info]
    )

    process_btn.click(
        fn=process_paper,
        inputs=[
            pdf_input,
            model_choice,
            summary_type,
            include_key_points,
            paper_title
        ],
        outputs=[
            status_output,
            summary_output,
            key_points_output,
            pdf_output
        ],
        show_progress=True
    )

    gr.Markdown(
        """
        ---
        šŸ”¬ Research Paper Summarizer | Powered by LangChain & AI Models | Built with ā¤ļø using Gradio
        """,
        elem_classes="footer"
    )

if __name__ == "__main__":
    # NOTE(review): share=True is ignored on Hugging Face Spaces (and spawns a
    # tunnel locally); kept as-is to preserve existing behavior.
    app.launch(
        share=True,
        show_error=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860
    )