# Research Paper Summarizer using LangChain and Gradio
# Hugging Face Spaces ready – robust chunking for large PDFs
import gradio as gr
import os
import html
import tempfile
from io import BytesIO

from dotenv import load_dotenv
import PyPDF2
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer

# Pull OPENAI_API_KEY / HUGGINGFACE_TOKEN from a local .env when present.
load_dotenv()


# --- Helper for robust chunking ---
def chunk_text_for_hf(text, tokenizer, max_tokens=1024, overlap=50):
    """Split text into chunks compatible with Hugging Face summarizers.

    Encodes the full text once, then slides a ``max_tokens``-wide window
    across the token sequence with ``overlap`` tokens of context carried
    between consecutive chunks, decoding each window back to a string.

    Args:
        text: Raw text to split.
        tokenizer: Object exposing ``encode``/``decode`` (e.g. an
            ``AutoTokenizer``).
        max_tokens: Maximum tokens per chunk (model input limit).
        overlap: Tokens shared between adjacent chunks to preserve context.

    Returns:
        List of chunk strings; a single-element list when the text already
        fits within ``max_tokens``.
    """
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)

    if total_tokens <= max_tokens:
        return [text]

    # Guard: if overlap >= max_tokens the window would never advance and the
    # loop below would spin forever. Always move forward by at least 1 token.
    step = max(1, max_tokens - overlap)

    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
        start += step
    return chunks


def summarize_long_text_hf(text, summarizer, tokenizer, max_tokens=1024,
                           overlap=50, max_length=150, min_length=40):
    """Summarize long text by chunking and combining summaries (Hugging Face models).

    Each token-bounded chunk is summarized independently (greedy decoding,
    ``do_sample=False``) and the partial summaries are joined with spaces.

    Args:
        text: Full document text.
        summarizer: A ``transformers`` summarization pipeline.
        tokenizer: Tokenizer matching the summarizer's model.
        max_tokens / overlap: Chunking parameters (see ``chunk_text_for_hf``).
        max_length / min_length: Per-chunk summary length bounds in tokens.

    Returns:
        Concatenated summary string covering the whole input.
    """
    text_chunks = chunk_text_for_hf(text, tokenizer, max_tokens, overlap)
    summaries = []
    for chunk in text_chunks:
        # truncation=True is a safety net in case decode/re-encode drift pushes
        # a chunk slightly over the model's input limit.
        summary = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True
        )[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)


class ResearchPaperSummarizer:
    """Orchestrates model selection, PDF text extraction, and summarization."""

    def __init__(self):
        self.llm = None             # LangChain LLM wrapper (OpenAI or HF pipeline)
        self.model_info = ""        # Human-readable description of the loaded model
        self.hf_tokenizer = None    # Tokenizer for HF chunking path
        self.hf_summarizer = None   # transformers summarization pipeline
        self.is_hf_pipeline = False # True -> use robust HF chunking, not LangChain chains

    def setup_llm(self, model_choice):
        """Setup LLM based on user choice.

        Args:
            model_choice: One of the dropdown labels (e.g. "OpenAI GPT-4",
                "Hugging Face BART").

        Returns:
            (success, message) tuple; ``message`` is user-facing status text.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        self.is_hf_pipeline = False

        try:
            if "OpenAI" in model_choice:
                if not openai_api_key:
                    return False, "āŒ OpenAI API Key not found in environment variables. Please add OPENAI_API_KEY to your Hugging Face Space settings."
                # LangChain's OpenAI wrappers read the key from the environment.
                os.environ["OPENAI_API_KEY"] = openai_api_key

                if "GPT-4" in model_choice:
                    self.llm = ChatOpenAI(model_name="gpt-4", temperature=0.3)
                    self.model_info = "šŸš€ Using GPT-4 (Premium)"
                else:
                    self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
                    self.model_info = "šŸš€ Using GPT-3.5 Turbo"
            else:
                # Free path: local transformers pipeline, no API key required.
                self.is_hf_pipeline = True
                if "BART" in model_choice:
                    model_id = "facebook/bart-large-cnn"
                else:
                    model_id = "t5-base"

                # NOTE(review): use_auth_token is deprecated in newer
                # transformers releases in favor of `token=` — confirm the
                # pinned transformers version before migrating.
                self.hf_summarizer = pipeline(
                    "summarization",
                    model=model_id,
                    tokenizer=model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                self.hf_tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                # Wrapped for API symmetry with the OpenAI path; summarization
                # actually goes through summarize_long_text_hf, not this wrapper.
                self.llm = HuggingFacePipeline(pipeline=self.hf_summarizer)
                self.model_info = f"šŸ¤— Using {model_id} model"

            return True, f"āœ… Model loaded successfully! {self.model_info}"
        except Exception as e:
            return False, f"āŒ Error loading model: {str(e)}"

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from uploaded PDF.

        Args:
            pdf_file: Path/file object from the Gradio File component.

        Returns:
            (text, message) — ``text`` is None on failure.
        """
        try:
            if pdf_file is None:
                return None, "āŒ No PDF file uploaded"

            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pages = []
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
            # Join with newlines so the last word of one page doesn't fuse
            # with the first word of the next (the original "+=" did that).
            text = "\n".join(pages)

            if not text.strip():
                return None, "āŒ No text could be extracted from the PDF"
            return text, f"āœ… Successfully extracted {len(text):,} characters from PDF"
        except Exception as e:
            return None, f"āŒ Error reading PDF: {str(e)}"

    def create_documents(self, text):
        """Split text into manageable chunks for LangChain LLMs."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        return [Document(page_content=chunk) for chunk in chunks]

    def generate_summary(self, documents, summary_type="map_reduce", raw_text=None):
        """Generate summary using LangChain or robust HF chunking.

        Args:
            documents: LangChain ``Document`` chunks (OpenAI path).
            summary_type: "map_reduce", "stuff", or anything else -> "refine".
            raw_text: Full original text; required for the HF chunking path.

        Returns:
            Summary string, or an error string starting with "āŒ".
        """
        try:
            # For Hugging Face models, bypass LangChain chains entirely and use
            # token-aware chunking (summarization pipelines can't follow chain
            # prompts the way chat models can).
            if self.is_hf_pipeline and raw_text and self.hf_summarizer and self.hf_tokenizer:
                return summarize_long_text_hf(
                    raw_text,
                    self.hf_summarizer,
                    self.hf_tokenizer,
                    max_tokens=1024,
                    overlap=50,
                    max_length=150,
                    min_length=40
                )

            # For OpenAI or other models, use a LangChain summarization chain.
            chain_type = summary_type if summary_type in ("map_reduce", "stuff") else "refine"
            chain = load_summarize_chain(self.llm, chain_type=chain_type, verbose=False)
            return chain.run(documents)
        except Exception as e:
            return f"āŒ Error generating summary: {str(e)}"

    def create_structured_summary(self, text, documents):
        """Create a structured summary with different sections.

        Returns a dict with 'overall' and 'key_points' entries.
        """
        summaries = {}

        # Overall Summary
        summaries['overall'] = self.generate_summary(documents, "map_reduce", raw_text=text)

        # Key Points - Use first 8000 chars for key points
        key_points_text = text[:8000] if len(text) > 8000 else text
        # NOTE(review): HF summarization models don't follow instructions, so
        # on the free path this yields a summary of the excerpt rather than a
        # bullet list — acceptable fallback, instruction works for OpenAI.
        key_points_prompt = f"""
        Extract the 5-7 most important key points from this research paper:
        
        {key_points_text}
        """
        key_points_docs = [Document(page_content=key_points_prompt)]
        summaries['key_points'] = self.generate_summary(
            key_points_docs, "stuff", raw_text=key_points_prompt
        )
        return summaries

    def create_pdf_summary(self, summaries, paper_title="Research Paper Summary"):
        """Create PDF with the summary.

        Args:
            summaries: Dict with 'overall' and optionally 'key_points' text.
            paper_title: Title rendered at the top of the PDF.

        Returns:
            Path to a temp PDF file (caller/Gradio is responsible for cleanup).
        """
        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []

        # Title
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor='darkblue'
        )
        # html.escape: ReportLab's Paragraph parses mini-XML markup, so raw
        # '<', '>' or '&' in model output (common in ML papers) would raise.
        story.append(Paragraph(html.escape(paper_title), title_style))
        story.append(Spacer(1, 12))

        # Overall Summary
        story.append(Paragraph("Overall Summary", styles['Heading2']))
        story.append(Spacer(1, 12))
        story.append(Paragraph(
            html.escape(summaries.get('overall', 'No summary available')),
            styles['Normal']
        ))
        story.append(Spacer(1, 20))

        # Key Points
        if 'key_points' in summaries:
            story.append(Paragraph("Key Points", styles['Heading2']))
            story.append(Spacer(1, 12))
            story.append(Paragraph(html.escape(summaries['key_points']), styles['Normal']))

        doc.build(story)
        buffer.seek(0)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(buffer.getvalue())
        temp_file.close()
        return temp_file.name


# Initialize the summarizer (single shared instance for the Gradio app)
summarizer = ResearchPaperSummarizer()


def process_paper(pdf_file, model_choice, summary_type, include_key_points, paper_title):
    """Main function to process the research paper.

    Returns:
        (status_message, overall_summary, key_points, pdf_file_path) matching
        the four Gradio output components; pdf_file_path may be None.
    """
    # Setup model
    success, message = summarizer.setup_llm(model_choice)
    if not success:
        return message, "", "", None
    status_message = message + "\n\n"

    # Extract text from PDF
    text, extract_message = summarizer.extract_text_from_pdf(pdf_file)
    status_message += extract_message + "\n\n"
    if text is None:
        return status_message, "", "", None

    # Create documents
    documents = summarizer.create_documents(text)
    status_message += f"šŸ“ Text split into {len(documents)} chunks for processing\n\n"

    # Generate summary
    status_message += "šŸ”„ Generating summary... Please wait...\n\n"
    try:
        if include_key_points:
            summaries = summarizer.create_structured_summary(text, documents)
            overall_summary = summaries.get('overall', 'No summary generated')
            key_points = summaries.get('key_points', 'No key points generated')
        else:
            overall_summary = summarizer.generate_summary(documents, summary_type, raw_text=text)
            key_points = "Key points not requested"
            summaries = {'overall': overall_summary}

        status_message += "šŸŽ‰ Summary generated successfully!"

        # Generate PDF if title is provided
        pdf_file_path = None
        if paper_title and paper_title.strip():
            try:
                pdf_file_path = summarizer.create_pdf_summary(summaries, paper_title.strip())
                status_message += "\nšŸ“„ PDF summary created!"
            except Exception as e:
                # Best-effort: a PDF failure shouldn't discard the summary.
                status_message += f"\nāš ļø PDF creation failed: {str(e)}"

        return status_message, overall_summary, key_points, pdf_file_path
    except Exception as e:
        return status_message + f"āŒ Error during processing: {str(e)}", "", "", None


def get_model_info(model_choice):
    """Return information about the selected model."""
    model_descriptions = {
        "OpenAI GPT-3.5": "šŸ’” **Fast and Efficient** - Good for most tasks, paid API required",
        "OpenAI GPT-4": "šŸš€ **Highest Quality** - Most advanced summaries, paid API required",
        "Hugging Face BART": "šŸ†“ **Free Model** - Optimized for summarization, slower on first load",
        "Hugging Face T5": "šŸ†“ **Free Versatile** - Good general-purpose model, slower on first load"
    }
    return model_descriptions.get(model_choice, "")


# Custom CSS for beautiful styling
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-interface {
    background: rgba(255, 255, 255, 0.95);
    backdrop-filter: blur(10px);
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
}
.gr-box {
    border-radius: 15px;
    border: 2px solid #e1e5e9;
    background: linear-gradient(145deg, #ffffff, #f0f2f5);
}
.gr-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border: none;
    border-radius: 10px;
    color: white;
    font-weight: bold;
    transition: transform 0.2s;
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.gr-textbox, .gr-dropdown {
    border-radius: 10px;
    border: 2px solid #e1e5e9;
}
.gr-file {
    border-radius: 15px;
    border: 3px dashed #667eea;
    background: linear-gradient(145deg, #f8f9ff, #ffffff);
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css, title="šŸ”¬ Research Paper Summarizer", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # šŸ”¬ Research Paper Summarizer
        ### Transform lengthy research papers into concise, insightful summaries using AI

        Upload your PDF research paper and get an intelligent summary with key points extracted automatically!
        """,
        elem_classes="header"
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## šŸ“ Upload & Configure")

            pdf_input = gr.File(
                label="šŸ“„ Upload Research Paper (PDF)",
                file_types=[".pdf"],
                elem_classes="file-upload"
            )

            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI GPT-3.5",
                    "OpenAI GPT-4",
                    "Hugging Face BART",
                    "Hugging Face T5"
                ],
                value="Hugging Face BART",
                label="šŸ¤– Choose AI Model",
                info="Free models work without API keys"
            )

            model_info = gr.Markdown("")

            summary_type = gr.Dropdown(
                choices=["map_reduce", "stuff", "refine"],
                value="map_reduce",
                label="šŸ“‹ Summary Method",
                info="map_reduce: best for long papers | stuff: faster for short papers | refine: iterative improvement"
            )

            include_key_points = gr.Checkbox(
                label="šŸ”‘ Include Key Points",
                value=True,
                info="Extract important key points separately"
            )

            paper_title = gr.Textbox(
                label="šŸ“ Paper Title (for PDF export)",
                placeholder="Enter the title of your research paper...",
                info="Optional: Used as title in the generated PDF summary"
            )

            process_btn = gr.Button(
                "šŸš€ Generate Summary",
                variant="primary",
                size="lg",
                elem_classes="process-button"
            )

        with gr.Column(scale=2):
            gr.Markdown("## šŸ“Š Results")

            status_output = gr.Textbox(
                label="šŸ“ˆ Processing Status",
                lines=8,
                max_lines=10,
                interactive=False,
                show_copy_button=True
            )

            summary_output = gr.Textbox(
                label="šŸ“‹ Overall Summary",
                lines=10,
                max_lines=15,
                interactive=False,
                show_copy_button=True,
                placeholder="Your paper summary will appear here..."
            )

            key_points_output = gr.Textbox(
                label="šŸ”‘ Key Points",
                lines=8,
                max_lines=12,
                interactive=False,
                show_copy_button=True,
                placeholder="Key points will be extracted here..."
            )

            pdf_output = gr.File(
                label="šŸ“„ Download PDF Summary",
                interactive=False
            )

    with gr.Accordion("šŸ”§ Setup Instructions for API Keys", open=False):
        gr.Markdown(
            """
            ### For Enhanced Performance (Optional):

            **OpenAI API Setup:**
            1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
            2. In your Hugging Face Space settings, add: `OPENAI_API_KEY = your_key_here`
            3. Restart your Space to apply changes

            **Hugging Face Token Setup:**
            1. Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
            2. Add: `HUGGINGFACE_TOKEN = your_token_here`
            3. Provides access to gated models and higher rate limits

            **Note:** Free Hugging Face models work without any API keys but may be slower on first load.
            """
        )

    with gr.Accordion("šŸ’” Tips for Best Results", open=False):
        gr.Markdown(
            """
            ### Optimization Tips:

            - **šŸ“„ File Size:** Smaller PDFs (< 10MB) process faster
            - **šŸ¤– Model Choice:** OpenAI models provide highest quality but require API keys
            - **⚔ Speed:** "stuff" method is fastest for papers under 20 pages
            - **šŸ“Š Quality:** "map_reduce" works best for comprehensive summaries of long papers
            - **šŸ”„ First Load:** Hugging Face models may take 2-3 minutes to load initially
            - **šŸ“± Mobile:** Works on mobile devices but desktop recommended for large files
            """
        )

    # Show the model description whenever the dropdown changes.
    model_choice.change(
        fn=get_model_info,
        inputs=[model_choice],
        outputs=[model_info]
    )

    process_btn.click(
        fn=process_paper,
        inputs=[
            pdf_input,
            model_choice,
            summary_type,
            include_key_points,
            paper_title
        ],
        outputs=[
            status_output,
            summary_output,
            key_points_output,
            pdf_output
        ],
        show_progress=True
    )

    gr.Markdown(
        """
        ---
        šŸ”¬ Research Paper Summarizer | Powered by LangChain & AI Models | Built with ā¤ļø using Gradio
        """,
        elem_classes="footer"
    )

if __name__ == "__main__":
    # NOTE(review): share=True is ignored on Hugging Face Spaces (and spawns a
    # tunnel locally); kept as-is to preserve existing behavior.
    app.launch(
        share=True,
        show_error=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860
    )