# RP_Sum / app.py
# Sazzz02's picture
# Update app.py
# 09d367e verified
# Research Paper Summarizer using LangChain and Gradio
# Hugging Face Spaces ready – robust chunking for large PDFs
import gradio as gr
import os
from dotenv import load_dotenv
import PyPDF2
from io import BytesIO
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import tempfile
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer
# Load variables from a local .env file if present (no-op on HF Spaces,
# where secrets arrive as real environment variables).
load_dotenv()
# --- Helper for robust chunking ---
def chunk_text_for_hf(text, tokenizer, max_tokens=1024, overlap=50):
    """Split text into token-bounded, overlapping chunks for a HF summarizer.

    Args:
        text: Raw input text to split.
        tokenizer: Hugging Face tokenizer exposing ``encode``/``decode``.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Tokens shared between consecutive chunks so sentences cut at
            a boundary still (partly) appear in the next chunk.

    Returns:
        List of text chunks; a single-element list when the whole text
        already fits in ``max_tokens``.

    Raises:
        ValueError: If ``overlap >= max_tokens`` and chunking is required —
            the window stride would be <= 0 and the original loop would
            never terminate.
    """
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)
    # Fast path: no chunking needed.
    if total_tokens <= max_tokens:
        return [text]
    stride = max_tokens - overlap
    if stride <= 0:
        raise ValueError("overlap must be smaller than max_tokens")
    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        # Decode back to text so the summarization pipeline can re-tokenize.
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        start += stride
    return chunks
def summarize_long_text_hf(text, summarizer, tokenizer, max_tokens=1024, overlap=50, max_length=150, min_length=40):
    """Summarize arbitrarily long text with a Hugging Face pipeline.

    The text is first split into token-bounded chunks (see
    ``chunk_text_for_hf``); each chunk is summarized independently and the
    per-chunk summaries are joined with spaces into one string.
    """
    partial_summaries = []
    for piece in chunk_text_for_hf(text, tokenizer, max_tokens, overlap):
        result = summarizer(
            piece,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True
        )
        partial_summaries.append(result[0]['summary_text'])
    return " ".join(partial_summaries)
class ResearchPaperSummarizer:
    """Encapsulates model setup, PDF text extraction, chunking, summary
    generation, and PDF export for the Gradio app below."""

    def __init__(self):
        self.llm = None              # LangChain LLM wrapper (OpenAI or HF pipeline)
        self.model_info = ""         # Human-readable description of the active model
        self.hf_tokenizer = None     # Tokenizer used for token-accurate chunking (HF path)
        self.hf_summarizer = None    # transformers summarization pipeline (HF path)
        self.is_hf_pipeline = False  # True when a local Hugging Face model is active

    def setup_llm(self, model_choice):
        """Instantiate the LLM selected in the UI.

        Returns:
            (success, message) tuple; ``message`` is user-facing status text.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        self.is_hf_pipeline = False
        try:
            if "OpenAI" in model_choice:
                if not openai_api_key:
                    return False, "❌ OpenAI API Key not found in environment variables. Please add OPENAI_API_KEY to your Hugging Face Space settings."
                os.environ["OPENAI_API_KEY"] = openai_api_key
                if "GPT-4" in model_choice:
                    self.llm = ChatOpenAI(model_name="gpt-4", temperature=0.3)
                    self.model_info = "πŸš€ Using GPT-4 (Premium)"
                else:
                    self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
                    self.model_info = "πŸš€ Using GPT-3.5 Turbo"
            else:
                self.is_hf_pipeline = True
                model_id = "facebook/bart-large-cnn" if "BART" in model_choice else "t5-base"
                # NOTE(review): use_auth_token is deprecated in newer transformers
                # releases in favor of token= — kept for compatibility with the
                # version pinned in this Space; confirm before upgrading.
                self.hf_summarizer = pipeline(
                    "summarization",
                    model=model_id,
                    tokenizer=model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                self.hf_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token if hf_token else None)
                # Wrapped so the LangChain chains can also drive this model.
                self.llm = HuggingFacePipeline(pipeline=self.hf_summarizer)
                self.model_info = f"πŸ€— Using {model_id} model"
            return True, f"βœ… Model loaded successfully! {self.model_info}"
        except Exception as e:
            return False, f"❌ Error loading model: {str(e)}"

    def extract_text_from_pdf(self, pdf_file):
        """Extract all page text from the uploaded PDF.

        Returns:
            (text, message) — ``text`` is None on failure; ``message`` is
            user-facing status text either way.
        """
        try:
            if pdf_file is None:
                return None, "❌ No PDF file uploaded"
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Join pages with a newline so the last word of one page is not
            # fused with the first word of the next (extract_text output
            # rarely ends with whitespace).
            page_texts = [page.extract_text() for page in pdf_reader.pages]
            text = "\n".join(t for t in page_texts if t)
            if not text.strip():
                return None, "❌ No text could be extracted from the PDF"
            return text, f"βœ… Successfully extracted {len(text):,} characters from PDF"
        except Exception as e:
            return None, f"❌ Error reading PDF: {str(e)}"

    def create_documents(self, text):
        """Split raw text into LangChain ``Document`` chunks.

        Character-based splitting (4000 chars, 200 overlap) is sufficient for
        the LangChain/OpenAI path; the HF path re-chunks by tokens instead.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

    def generate_summary(self, documents, summary_type="map_reduce", raw_text=None):
        """Generate a summary with the active model.

        HF pipeline models bypass LangChain and use token-accurate chunking on
        ``raw_text``; otherwise the selected LangChain summarize chain runs
        over ``documents``. Errors are returned as user-facing strings rather
        than raised, matching how the UI displays results.
        """
        try:
            # Hugging Face path: robust token-based chunking.
            if self.is_hf_pipeline and raw_text and self.hf_summarizer and self.hf_tokenizer:
                return summarize_long_text_hf(
                    raw_text, self.hf_summarizer, self.hf_tokenizer,
                    max_tokens=1024, overlap=50, max_length=150, min_length=40
                )
            # LangChain path: any unknown summary_type falls back to "refine",
            # preserving the original if/elif/else behavior.
            chain_type = summary_type if summary_type in ("map_reduce", "stuff") else "refine"
            chain = load_summarize_chain(self.llm, chain_type=chain_type, verbose=False)
            return chain.run(documents)
        except Exception as e:
            return f"❌ Error generating summary: {str(e)}"

    def create_structured_summary(self, text, documents):
        """Produce both an overall summary and an extracted key-points section."""
        summaries = {}
        # Overall summary over the whole paper.
        summaries['overall'] = self.generate_summary(documents, "map_reduce", raw_text=text)
        # Key points: cap the excerpt at 8000 chars to keep the prompt small.
        key_points_text = text[:8000]
        key_points_prompt = f"""
Extract the 5-7 most important key points from this research paper:
{key_points_text}
"""
        key_points_docs = [Document(page_content=key_points_prompt)]
        summaries['key_points'] = self.generate_summary(key_points_docs, "stuff", raw_text=key_points_prompt)
        return summaries

    def create_pdf_summary(self, summaries, paper_title="Research Paper Summary"):
        """Render the summaries into a PDF and return the temp-file path.

        The temp file is created with delete=False so Gradio can serve it
        after this call returns; the OS temp dir eventually reclaims it.
        """
        from html import escape  # stdlib; needed only for PDF export

        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor='darkblue'
        )
        # reportlab's Paragraph parses its text as XML-like markup and raises
        # on stray '&', '<', '>' — which model output can easily contain — so
        # every dynamic string is escaped before rendering.
        story.append(Paragraph(escape(paper_title), title_style))
        story.append(Spacer(1, 12))
        story.append(Paragraph("Overall Summary", styles['Heading2']))
        story.append(Spacer(1, 12))
        story.append(Paragraph(escape(summaries.get('overall', 'No summary available')), styles['Normal']))
        story.append(Spacer(1, 20))
        if 'key_points' in summaries:
            story.append(Paragraph("Key Points", styles['Heading2']))
            story.append(Spacer(1, 12))
            story.append(Paragraph(escape(summaries['key_points']), styles['Normal']))
        doc.build(story)
        buffer.seek(0)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(buffer.getvalue())
        temp_file.close()
        return temp_file.name
# Initialize the summarizer
# Module-level singleton shared by all Gradio callbacks below.
summarizer = ResearchPaperSummarizer()
def process_paper(pdf_file, model_choice, summary_type, include_key_points, paper_title):
    """End-to-end pipeline driven by the "Generate Summary" button.

    Returns a 4-tuple feeding the four output widgets:
    (status text, overall summary, key points, optional PDF path).
    """
    # 1. Load the requested model; bail out early on failure.
    ok, message = summarizer.setup_llm(model_choice)
    if not ok:
        return message, "", "", None
    status = message + "\n\n"

    # 2. Pull the raw text out of the PDF.
    text, extract_message = summarizer.extract_text_from_pdf(pdf_file)
    status += extract_message + "\n\n"
    if text is None:
        return status, "", "", None

    # 3. Chunk for LangChain-based summarization.
    documents = summarizer.create_documents(text)
    status += f"πŸ“ Text split into {len(documents)} chunks for processing\n\n"

    # 4. Summarize, optionally with a separate key-points pass.
    status += "πŸ”„ Generating summary... Please wait...\n\n"
    try:
        if include_key_points:
            summaries = summarizer.create_structured_summary(text, documents)
            overall_summary = summaries.get('overall', 'No summary generated')
            key_points = summaries.get('key_points', 'No key points generated')
        else:
            overall_summary = summarizer.generate_summary(documents, summary_type, raw_text=text)
            key_points = "Key points not requested"
            summaries = {'overall': overall_summary}
        status += "πŸŽ‰ Summary generated successfully!"

        # 5. Optional PDF export when a title was supplied; failure here is
        # reported in the status but does not discard the summaries.
        pdf_file_path = None
        if paper_title and paper_title.strip():
            try:
                pdf_file_path = summarizer.create_pdf_summary(summaries, paper_title.strip())
                status += "\nπŸ“„ PDF summary created!"
            except Exception as e:
                status += f"\n⚠️ PDF creation failed: {str(e)}"
        return status, overall_summary, key_points, pdf_file_path
    except Exception as e:
        return status + f"❌ Error during processing: {str(e)}", "", "", None
def get_model_info(model_choice):
    """Return a short markdown blurb describing the selected model.

    Unknown choices yield an empty string so the info widget clears.
    """
    if model_choice == "OpenAI GPT-3.5":
        return "πŸ’‘ **Fast and Efficient** - Good for most tasks, paid API required"
    if model_choice == "OpenAI GPT-4":
        return "πŸš€ **Highest Quality** - Most advanced summaries, paid API required"
    if model_choice == "Hugging Face BART":
        return "πŸ†“ **Free Model** - Optimized for summarization, slower on first load"
    if model_choice == "Hugging Face T5":
        return "πŸ†“ **Free Versatile** - Good general-purpose model, slower on first load"
    return ""
# Custom CSS for beautiful styling
custom_css = """
.gradio-container {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-interface {
background: rgba(255, 255, 255, 0.95);
backdrop-filter: blur(10px);
border-radius: 20px;
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
}
.gr-box {
border-radius: 15px;
border: 2px solid #e1e5e9;
background: linear-gradient(145deg, #ffffff, #f0f2f5);
}
.gr-button {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border: none;
border-radius: 10px;
color: white;
font-weight: bold;
transition: transform 0.2s;
}
.gr-button:hover {
transform: translateY(-2px);
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.gr-textbox, .gr-dropdown {
border-radius: 10px;
border: 2px solid #e1e5e9;
}
.gr-file {
border-radius: 15px;
border: 3px dashed #667eea;
background: linear-gradient(145deg, #f8f9ff, #ffffff);
}
"""
# Create the Gradio interface
# Layout: left column = inputs/configuration, right column = results,
# followed by two help accordions and the event wiring.
with gr.Blocks(css=custom_css, title="πŸ”¬ Research Paper Summarizer", theme=gr.themes.Soft()) as app:
    # Page header banner.
    gr.Markdown(
        """
# πŸ”¬ Research Paper Summarizer
### Transform lengthy research papers into concise, insightful summaries using AI
Upload your PDF research paper and get an intelligent summary with key points extracted automatically!
""",
        elem_classes="header"
    )
    with gr.Row():
        # Left column: upload + model/summary configuration.
        with gr.Column(scale=1):
            gr.Markdown("## πŸ“ Upload & Configure")
            pdf_input = gr.File(
                label="πŸ“„ Upload Research Paper (PDF)",
                file_types=[".pdf"],
                elem_classes="file-upload"
            )
            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI GPT-3.5",
                    "OpenAI GPT-4",
                    "Hugging Face BART",
                    "Hugging Face T5"
                ],
                value="Hugging Face BART",
                label="πŸ€– Choose AI Model",
                info="Free models work without API keys"
            )
            # Filled by get_model_info() whenever the dropdown changes.
            model_info = gr.Markdown("")
            summary_type = gr.Dropdown(
                choices=["map_reduce", "stuff", "refine"],
                value="map_reduce",
                label="πŸ“‹ Summary Method",
                info="map_reduce: best for long papers | stuff: faster for short papers | refine: iterative improvement"
            )
            include_key_points = gr.Checkbox(
                label="πŸ”‘ Include Key Points",
                value=True,
                info="Extract important key points separately"
            )
            paper_title = gr.Textbox(
                label="πŸ“ Paper Title (for PDF export)",
                placeholder="Enter the title of your research paper...",
                info="Optional: Used as title in the generated PDF summary"
            )
            process_btn = gr.Button(
                "πŸš€ Generate Summary",
                variant="primary",
                size="lg",
                elem_classes="process-button"
            )
        # Right column: processing status and the generated outputs.
        with gr.Column(scale=2):
            gr.Markdown("## πŸ“Š Results")
            status_output = gr.Textbox(
                label="πŸ“ˆ Processing Status",
                lines=8,
                max_lines=10,
                interactive=False,
                show_copy_button=True
            )
            summary_output = gr.Textbox(
                label="πŸ“‹ Overall Summary",
                lines=10,
                max_lines=15,
                interactive=False,
                show_copy_button=True,
                placeholder="Your paper summary will appear here..."
            )
            key_points_output = gr.Textbox(
                label="πŸ”‘ Key Points",
                lines=8,
                max_lines=12,
                interactive=False,
                show_copy_button=True,
                placeholder="Key points will be extracted here..."
            )
            pdf_output = gr.File(
                label="πŸ“„ Download PDF Summary",
                interactive=False
            )
    # Collapsible help: API-key setup instructions.
    with gr.Accordion("πŸ”§ Setup Instructions for API Keys", open=False):
        gr.Markdown(
            """
### For Enhanced Performance (Optional):
**OpenAI API Setup:**
1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
2. In your Hugging Face Space settings, add: `OPENAI_API_KEY = your_key_here`
3. Restart your Space to apply changes
**Hugging Face Token Setup:**
1. Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
2. Add: `HUGGINGFACE_TOKEN = your_token_here`
3. Provides access to gated models and higher rate limits
**Note:** Free Hugging Face models work without any API keys but may be slower on first load.
"""
        )
    # Collapsible help: usage tips.
    with gr.Accordion("πŸ’‘ Tips for Best Results", open=False):
        gr.Markdown(
            """
### Optimization Tips:
- **πŸ“„ File Size:** Smaller PDFs (< 10MB) process faster
- **πŸ€– Model Choice:** OpenAI models provide highest quality but require API keys
- **⚑ Speed:** "stuff" method is fastest for papers under 20 pages
- **πŸ“Š Quality:** "map_reduce" works best for comprehensive summaries of long papers
- **πŸ”„ First Load:** Hugging Face models may take 2-3 minutes to load initially
- **πŸ“± Mobile:** Works on mobile devices but desktop recommended for large files
"""
        )
    # Event wiring: live model description on dropdown change.
    model_choice.change(
        fn=get_model_info,
        inputs=[model_choice],
        outputs=[model_info]
    )
    # Event wiring: the main processing pipeline on button click.
    process_btn.click(
        fn=process_paper,
        inputs=[
            pdf_input,
            model_choice,
            summary_type,
            include_key_points,
            paper_title
        ],
        outputs=[
            status_output,
            summary_output,
            key_points_output,
            pdf_output
        ],
        show_progress=True
    )
    # Footer.
    gr.Markdown(
        """
---
<div style="text-align: center; color: #666; font-size: 14px;">
πŸ”¬ <strong>Research Paper Summarizer</strong> | Powered by LangChain & AI Models |
Built with ❀️ using Gradio
</div>
""",
        elem_classes="footer"
    )
if __name__ == "__main__":
    # NOTE(review): share=True is unnecessary (and warned about) when running
    # on Hugging Face Spaces, and debug=True blocks the main thread — both are
    # presumably here for local development; confirm before hardening.
    app.launch(
        share=True,
        show_error=True,
        debug=True,
        server_name="0.0.0.0",  # bind on all interfaces (required in containers)
        server_port=7860        # default port expected by HF Spaces
    )