# RP_Sum / app.py
# Sazzz02's picture
# Update app.py
# 09d367e verified
# Research Paper Summarizer using LangChain and Gradio
# Hugging Face Spaces ready – robust chunking for large PDFs
import gradio as gr
import os
from dotenv import load_dotenv
import PyPDF2
from io import BytesIO
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import tempfile
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer
# Load variables from a local .env file if present (no-op on HF Spaces,
# where secrets arrive as real environment variables).
load_dotenv()
# --- Helper for robust chunking ---
def chunk_text_for_hf(text, tokenizer, max_tokens=1024, overlap=50):
    """Split text into token-bounded, overlapping chunks for a HF summarizer.

    Args:
        text: Raw input text to split.
        tokenizer: Hugging Face tokenizer exposing ``encode``/``decode``.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Tokens shared between consecutive chunks so sentences cut at
            a boundary still (partly) appear in the next chunk.

    Returns:
        List of text chunks; a single-element list when the whole text
        already fits in ``max_tokens``.

    Raises:
        ValueError: If ``overlap >= max_tokens`` and chunking is required —
            the window stride would be <= 0 and the original loop would
            never terminate.
    """
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)
    # Fast path: no chunking needed.
    if total_tokens <= max_tokens:
        return [text]
    stride = max_tokens - overlap
    if stride <= 0:
        raise ValueError("overlap must be smaller than max_tokens")
    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        # Decode back to text so the summarization pipeline can re-tokenize.
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        start += stride
    return chunks
def summarize_long_text_hf(text, summarizer, tokenizer, max_tokens=1024, overlap=50, max_length=150, min_length=40):
    """Summarize arbitrarily long text with a Hugging Face pipeline.

    The text is first split into token-bounded chunks (see
    ``chunk_text_for_hf``); each chunk is summarized independently and the
    per-chunk summaries are joined with spaces into one string.
    """
    partial_summaries = []
    for piece in chunk_text_for_hf(text, tokenizer, max_tokens, overlap):
        result = summarizer(
            piece,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True
        )
        partial_summaries.append(result[0]['summary_text'])
    return " ".join(partial_summaries)
class ResearchPaperSummarizer:
    """Encapsulates model setup, PDF text extraction, chunking, summary
    generation, and PDF export for the Gradio app below."""

    def __init__(self):
        self.llm = None              # LangChain LLM wrapper (OpenAI or HF pipeline)
        self.model_info = ""         # Human-readable description of the active model
        self.hf_tokenizer = None     # Tokenizer used for token-accurate chunking (HF path)
        self.hf_summarizer = None    # transformers summarization pipeline (HF path)
        self.is_hf_pipeline = False  # True when a local Hugging Face model is active

    def setup_llm(self, model_choice):
        """Instantiate the LLM selected in the UI.

        Returns:
            (success, message) tuple; ``message`` is user-facing status text.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        self.is_hf_pipeline = False
        try:
            if "OpenAI" in model_choice:
                if not openai_api_key:
                    return False, "❌ OpenAI API Key not found in environment variables. Please add OPENAI_API_KEY to your Hugging Face Space settings."
                os.environ["OPENAI_API_KEY"] = openai_api_key
                if "GPT-4" in model_choice:
                    self.llm = ChatOpenAI(model_name="gpt-4", temperature=0.3)
                    self.model_info = "πŸš€ Using GPT-4 (Premium)"
                else:
                    self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
                    self.model_info = "πŸš€ Using GPT-3.5 Turbo"
            else:
                self.is_hf_pipeline = True
                model_id = "facebook/bart-large-cnn" if "BART" in model_choice else "t5-base"
                # NOTE(review): use_auth_token is deprecated in newer transformers
                # releases in favor of token= — kept for compatibility with the
                # version pinned in this Space; confirm before upgrading.
                self.hf_summarizer = pipeline(
                    "summarization",
                    model=model_id,
                    tokenizer=model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                self.hf_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token if hf_token else None)
                # Wrapped so the LangChain chains can also drive this model.
                self.llm = HuggingFacePipeline(pipeline=self.hf_summarizer)
                self.model_info = f"πŸ€— Using {model_id} model"
            return True, f"βœ… Model loaded successfully! {self.model_info}"
        except Exception as e:
            return False, f"❌ Error loading model: {str(e)}"

    def extract_text_from_pdf(self, pdf_file):
        """Extract all page text from the uploaded PDF.

        Returns:
            (text, message) — ``text`` is None on failure; ``message`` is
            user-facing status text either way.
        """
        try:
            if pdf_file is None:
                return None, "❌ No PDF file uploaded"
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Join pages with a newline so the last word of one page is not
            # fused with the first word of the next (extract_text output
            # rarely ends with whitespace).
            page_texts = [page.extract_text() for page in pdf_reader.pages]
            text = "\n".join(t for t in page_texts if t)
            if not text.strip():
                return None, "❌ No text could be extracted from the PDF"
            return text, f"βœ… Successfully extracted {len(text):,} characters from PDF"
        except Exception as e:
            return None, f"❌ Error reading PDF: {str(e)}"

    def create_documents(self, text):
        """Split raw text into LangChain ``Document`` chunks.

        Character-based splitting (4000 chars, 200 overlap) is sufficient for
        the LangChain/OpenAI path; the HF path re-chunks by tokens instead.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

    def generate_summary(self, documents, summary_type="map_reduce", raw_text=None):
        """Generate a summary with the active model.

        HF pipeline models bypass LangChain and use token-accurate chunking on
        ``raw_text``; otherwise the selected LangChain summarize chain runs
        over ``documents``. Errors are returned as user-facing strings rather
        than raised, matching how the UI displays results.
        """
        try:
            # Hugging Face path: robust token-based chunking.
            if self.is_hf_pipeline and raw_text and self.hf_summarizer and self.hf_tokenizer:
                return summarize_long_text_hf(
                    raw_text, self.hf_summarizer, self.hf_tokenizer,
                    max_tokens=1024, overlap=50, max_length=150, min_length=40
                )
            # LangChain path: any unknown summary_type falls back to "refine",
            # preserving the original if/elif/else behavior.
            chain_type = summary_type if summary_type in ("map_reduce", "stuff") else "refine"
            chain = load_summarize_chain(self.llm, chain_type=chain_type, verbose=False)
            return chain.run(documents)
        except Exception as e:
            return f"❌ Error generating summary: {str(e)}"

    def create_structured_summary(self, text, documents):
        """Produce both an overall summary and an extracted key-points section."""
        summaries = {}
        # Overall summary over the whole paper.
        summaries['overall'] = self.generate_summary(documents, "map_reduce", raw_text=text)
        # Key points: cap the excerpt at 8000 chars to keep the prompt small.
        key_points_text = text[:8000]
        key_points_prompt = f"""
Extract the 5-7 most important key points from this research paper:
{key_points_text}
"""
        key_points_docs = [Document(page_content=key_points_prompt)]
        summaries['key_points'] = self.generate_summary(key_points_docs, "stuff", raw_text=key_points_prompt)
        return summaries

    def create_pdf_summary(self, summaries, paper_title="Research Paper Summary"):
        """Render the summaries into a PDF and return the temp-file path.

        The temp file is created with delete=False so Gradio can serve it
        after this call returns; the OS temp dir eventually reclaims it.
        """
        from html import escape  # stdlib; needed only for PDF export

        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor='darkblue'
        )
        # reportlab's Paragraph parses its text as XML-like markup and raises
        # on stray '&', '<', '>' — which model output can easily contain — so
        # every dynamic string is escaped before rendering.
        story.append(Paragraph(escape(paper_title), title_style))
        story.append(Spacer(1, 12))
        story.append(Paragraph("Overall Summary", styles['Heading2']))
        story.append(Spacer(1, 12))
        story.append(Paragraph(escape(summaries.get('overall', 'No summary available')), styles['Normal']))
        story.append(Spacer(1, 20))
        if 'key_points' in summaries:
            story.append(Paragraph("Key Points", styles['Heading2']))
            story.append(Spacer(1, 12))
            story.append(Paragraph(escape(summaries['key_points']), styles['Normal']))
        doc.build(story)
        buffer.seek(0)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(buffer.getvalue())
        temp_file.close()
        return temp_file.name
# Initialize the summarizer
# Module-level singleton shared by all Gradio callbacks below.
summarizer = ResearchPaperSummarizer()
def process_paper(pdf_file, model_choice, summary_type, include_key_points, paper_title):
    """End-to-end pipeline driven by the "Generate Summary" button.

    Returns a 4-tuple feeding the four output widgets:
    (status text, overall summary, key points, optional PDF path).
    """
    # 1. Load the requested model; bail out early on failure.
    ok, message = summarizer.setup_llm(model_choice)
    if not ok:
        return message, "", "", None
    status = message + "\n\n"

    # 2. Pull the raw text out of the PDF.
    text, extract_message = summarizer.extract_text_from_pdf(pdf_file)
    status += extract_message + "\n\n"
    if text is None:
        return status, "", "", None

    # 3. Chunk for LangChain-based summarization.
    documents = summarizer.create_documents(text)
    status += f"πŸ“ Text split into {len(documents)} chunks for processing\n\n"

    # 4. Summarize, optionally with a separate key-points pass.
    status += "πŸ”„ Generating summary... Please wait...\n\n"
    try:
        if include_key_points:
            summaries = summarizer.create_structured_summary(text, documents)
            overall_summary = summaries.get('overall', 'No summary generated')
            key_points = summaries.get('key_points', 'No key points generated')
        else:
            overall_summary = summarizer.generate_summary(documents, summary_type, raw_text=text)
            key_points = "Key points not requested"
            summaries = {'overall': overall_summary}
        status += "πŸŽ‰ Summary generated successfully!"

        # 5. Optional PDF export when a title was supplied; failure here is
        # reported in the status but does not discard the summaries.
        pdf_file_path = None
        if paper_title and paper_title.strip():
            try:
                pdf_file_path = summarizer.create_pdf_summary(summaries, paper_title.strip())
                status += "\nπŸ“„ PDF summary created!"
            except Exception as e:
                status += f"\n⚠️ PDF creation failed: {str(e)}"
        return status, overall_summary, key_points, pdf_file_path
    except Exception as e:
        return status + f"❌ Error during processing: {str(e)}", "", "", None
def get_model_info(model_choice):
    """Return a short markdown blurb describing the selected model.

    Unknown choices yield an empty string so the info widget clears.
    """
    if model_choice == "OpenAI GPT-3.5":
        return "πŸ’‘ **Fast and Efficient** - Good for most tasks, paid API required"
    if model_choice == "OpenAI GPT-4":
        return "πŸš€ **Highest Quality** - Most advanced summaries, paid API required"
    if model_choice == "Hugging Face BART":
        return "πŸ†“ **Free Model** - Optimized for summarization, slower on first load"
    if model_choice == "Hugging Face T5":
        return "πŸ†“ **Free Versatile** - Good general-purpose model, slower on first load"
    return ""
# Custom CSS for beautiful styling
custom_css = """
.gradio-container {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-interface {
background: rgba(255, 255, 255, 0.95);
backdrop-filter: blur(10px);
border-radius: 20px;
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
}
.gr-box {
border-radius: 15px;
border: 2px solid #e1e5e9;
background: linear-gradient(145deg, #ffffff, #f0f2f5);
}
.gr-button {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border: none;
border-radius: 10px;
color: white;
font-weight: bold;
transition: transform 0.2s;
}
.gr-button:hover {
transform: translateY(-2px);
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.gr-textbox, .gr-dropdown {
border-radius: 10px;
border: 2px solid #e1e5e9;
}
.gr-file {
border-radius: 15px;
border: 3px dashed #667eea;
background: linear-gradient(145deg, #f8f9ff, #ffffff);
}
"""
# Create the Gradio interface
# Layout: left column = inputs/configuration, right column = results,
# followed by two help accordions and the event wiring.
with gr.Blocks(css=custom_css, title="πŸ”¬ Research Paper Summarizer", theme=gr.themes.Soft()) as app:
    # Page header banner.
    gr.Markdown(
        """
# πŸ”¬ Research Paper Summarizer
### Transform lengthy research papers into concise, insightful summaries using AI
Upload your PDF research paper and get an intelligent summary with key points extracted automatically!
""",
        elem_classes="header"
    )
    with gr.Row():
        # Left column: upload + model/summary configuration.
        with gr.Column(scale=1):
            gr.Markdown("## πŸ“ Upload & Configure")
            pdf_input = gr.File(
                label="πŸ“„ Upload Research Paper (PDF)",
                file_types=[".pdf"],
                elem_classes="file-upload"
            )
            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI GPT-3.5",
                    "OpenAI GPT-4",
                    "Hugging Face BART",
                    "Hugging Face T5"
                ],
                value="Hugging Face BART",
                label="πŸ€– Choose AI Model",
                info="Free models work without API keys"
            )
            # Filled by get_model_info() whenever the dropdown changes.
            model_info = gr.Markdown("")
            summary_type = gr.Dropdown(
                choices=["map_reduce", "stuff", "refine"],
                value="map_reduce",
                label="πŸ“‹ Summary Method",
                info="map_reduce: best for long papers | stuff: faster for short papers | refine: iterative improvement"
            )
            include_key_points = gr.Checkbox(
                label="πŸ”‘ Include Key Points",
                value=True,
                info="Extract important key points separately"
            )
            paper_title = gr.Textbox(
                label="πŸ“ Paper Title (for PDF export)",
                placeholder="Enter the title of your research paper...",
                info="Optional: Used as title in the generated PDF summary"
            )
            process_btn = gr.Button(
                "πŸš€ Generate Summary",
                variant="primary",
                size="lg",
                elem_classes="process-button"
            )
        # Right column: processing status and the generated outputs.
        with gr.Column(scale=2):
            gr.Markdown("## πŸ“Š Results")
            status_output = gr.Textbox(
                label="πŸ“ˆ Processing Status",
                lines=8,
                max_lines=10,
                interactive=False,
                show_copy_button=True
            )
            summary_output = gr.Textbox(
                label="πŸ“‹ Overall Summary",
                lines=10,
                max_lines=15,
                interactive=False,
                show_copy_button=True,
                placeholder="Your paper summary will appear here..."
            )
            key_points_output = gr.Textbox(
                label="πŸ”‘ Key Points",
                lines=8,
                max_lines=12,
                interactive=False,
                show_copy_button=True,
                placeholder="Key points will be extracted here..."
            )
            pdf_output = gr.File(
                label="πŸ“„ Download PDF Summary",
                interactive=False
            )
    # Collapsible help: API-key setup instructions.
    with gr.Accordion("πŸ”§ Setup Instructions for API Keys", open=False):
        gr.Markdown(
            """
### For Enhanced Performance (Optional):
**OpenAI API Setup:**
1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
2. In your Hugging Face Space settings, add: `OPENAI_API_KEY = your_key_here`
3. Restart your Space to apply changes
**Hugging Face Token Setup:**
1. Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
2. Add: `HUGGINGFACE_TOKEN = your_token_here`
3. Provides access to gated models and higher rate limits
**Note:** Free Hugging Face models work without any API keys but may be slower on first load.
"""
        )
    # Collapsible help: usage tips.
    with gr.Accordion("πŸ’‘ Tips for Best Results", open=False):
        gr.Markdown(
            """
### Optimization Tips:
- **πŸ“„ File Size:** Smaller PDFs (< 10MB) process faster
- **πŸ€– Model Choice:** OpenAI models provide highest quality but require API keys
- **⚑ Speed:** "stuff" method is fastest for papers under 20 pages
- **πŸ“Š Quality:** "map_reduce" works best for comprehensive summaries of long papers
- **πŸ”„ First Load:** Hugging Face models may take 2-3 minutes to load initially
- **πŸ“± Mobile:** Works on mobile devices but desktop recommended for large files
"""
        )
    # Event wiring: live model description on dropdown change.
    model_choice.change(
        fn=get_model_info,
        inputs=[model_choice],
        outputs=[model_info]
    )
    # Event wiring: the main processing pipeline on button click.
    process_btn.click(
        fn=process_paper,
        inputs=[
            pdf_input,
            model_choice,
            summary_type,
            include_key_points,
            paper_title
        ],
        outputs=[
            status_output,
            summary_output,
            key_points_output,
            pdf_output
        ],
        show_progress=True
    )
    # Footer.
    gr.Markdown(
        """
---
<div style="text-align: center; color: #666; font-size: 14px;">
πŸ”¬ <strong>Research Paper Summarizer</strong> | Powered by LangChain & AI Models |
Built with ❀️ using Gradio
</div>
""",
        elem_classes="footer"
    )
if __name__ == "__main__":
    # NOTE(review): share=True is unnecessary (and warned about) when running
    # on Hugging Face Spaces, and debug=True blocks the main thread — both are
    # presumably here for local development; confirm before hardening.
    app.launch(
        share=True,
        show_error=True,
        debug=True,
        server_name="0.0.0.0",  # bind on all interfaces (required in containers)
        server_port=7860        # default port expected by HF Spaces
    )