|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
import PyPDF2 |
|
|
from io import BytesIO |
|
|
from reportlab.lib.pagesizes import letter |
|
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer |
|
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
|
import tempfile |
|
|
|
|
|
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain.chains.summarize import load_summarize_chain |
|
|
from langchain.docstore.document import Document |
|
|
from langchain.llms import OpenAI |
|
|
from langchain.chat_models import ChatOpenAI |
|
|
from langchain.llms import HuggingFacePipeline |
|
|
|
|
|
from transformers import pipeline, AutoTokenizer |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
|
|
|
def chunk_text_for_hf(text, tokenizer, max_tokens=1024, overlap=50):
    """Split text into chunks compatible with Hugging Face summarizers.

    Args:
        text: Raw input text to split.
        tokenizer: Hugging Face tokenizer exposing ``encode``/``decode``.
        max_tokens: Maximum number of tokens per chunk (model context budget).
        overlap: Tokens shared between consecutive chunks so that sentences
            straddling a chunk boundary are not lost.

    Returns:
        list[str]: Decoded text chunks, each at most ``max_tokens`` tokens.
    """
    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)
    # Fast path: return the original text untouched when it already fits.
    if total_tokens <= max_tokens:
        return [text]

    # Bug fix: with overlap >= max_tokens the original advance
    # (max_tokens - overlap) is <= 0 and the loop never terminates.
    # Clamp the step to at least one token per iteration.
    step = max(1, max_tokens - overlap)

    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        window = tokens[start:end]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        start += step
    return chunks
|
|
|
|
|
def summarize_long_text_hf(text, summarizer, tokenizer, max_tokens=1024, overlap=50, max_length=150, min_length=40):
    """Summarize long text by chunking and combining summaries (Hugging Face models).

    Each chunk is summarized independently with deterministic decoding and
    the partial summaries are stitched together with single spaces.
    """
    pieces = chunk_text_for_hf(text, tokenizer, max_tokens, overlap)
    partial_summaries = [
        summarizer(
            piece,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        for piece in pieces
    ]
    return " ".join(partial_summaries)
|
|
|
|
|
class ResearchPaperSummarizer:
    """Coordinates the summarization workflow: model selection, PDF text
    extraction, chunking, summary generation, and PDF export.

    A single instance is shared across Gradio callbacks; ``setup_llm`` is
    re-run on every request, so the active model may change between calls.
    """

    def __init__(self):
        # LangChain-compatible LLM (ChatOpenAI or a HuggingFacePipeline wrapper).
        self.llm = None
        # Human-readable label of the active model, used in status messages.
        self.model_info = ""
        # Tokenizer / transformers pipeline; only populated for HF models.
        self.hf_tokenizer = None
        self.hf_summarizer = None
        # When True, generate_summary() bypasses LangChain and calls the raw
        # transformers pipeline via summarize_long_text_hf().
        self.is_hf_pipeline = False

    def setup_llm(self, model_choice):
        """Setup LLM based on user choice.

        Parameters:
            model_choice: Dropdown label. "OpenAI ..." selects ChatOpenAI
                (GPT-4 vs GPT-3.5 by substring); anything else selects a
                Hugging Face pipeline ("BART" -> facebook/bart-large-cnn,
                otherwise t5-base).

        Returns:
            (success, message) tuple; ``message`` is shown to the user.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        hf_token = os.getenv("HUGGINGFACE_TOKEN")
        # Reset so a previous HF selection does not leak into an OpenAI run.
        self.is_hf_pipeline = False
        try:
            if "OpenAI" in model_choice:
                if not openai_api_key:
                    return False, "β OpenAI API Key not found in environment variables. Please add OPENAI_API_KEY to your Hugging Face Space settings."
                os.environ["OPENAI_API_KEY"] = openai_api_key
                if "GPT-4" in model_choice:
                    self.llm = ChatOpenAI(model_name="gpt-4", temperature=0.3)
                    self.model_info = "π Using GPT-4 (Premium)"
                else:
                    self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
                    self.model_info = "π Using GPT-3.5 Turbo"
            else:
                self.is_hf_pipeline = True
                if "BART" in model_choice:
                    model_id = "facebook/bart-large-cnn"
                else:
                    model_id = "t5-base"
                # NOTE(review): `use_auth_token` is deprecated in recent
                # transformers releases (replaced by `token`) — confirm the
                # pinned library version still accepts it.
                self.hf_summarizer = pipeline(
                    "summarization",
                    model=model_id,
                    tokenizer=model_id,
                    use_auth_token=hf_token if hf_token else None
                )
                self.hf_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token if hf_token else None)
                # Wrapped for LangChain chains, although HF runs normally take
                # the raw-pipeline path inside generate_summary().
                self.llm = HuggingFacePipeline(pipeline=self.hf_summarizer)
                self.model_info = f"π€ Using {model_id} model"
            return True, f"β Model loaded successfully! {self.model_info}"
        except Exception as e:
            return False, f"β Error loading model: {str(e)}"

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from uploaded PDF.

        Returns:
            (text, message): ``text`` is None on failure; ``message`` is a
            user-facing status string either way.
        """
        try:
            if pdf_file is None:
                return None, "β No PDF file uploaded"
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                # extract_text() may yield None/"" for image-only pages.
                if page_text:
                    text += page_text
            if not text.strip():
                return None, "β No text could be extracted from the PDF"
            return text, f"β Successfully extracted {len(text):,} characters from PDF"
        except Exception as e:
            return None, f"β Error reading PDF: {str(e)}"

    def create_documents(self, text):
        """Split text into manageable chunks for LangChain LLMs"""
        # 4000 chars/chunk with a 200-char overlap keeps each chunk well
        # inside typical chat-model context limits while preserving context
        # across boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        documents = [Document(page_content=chunk) for chunk in chunks]
        return documents

    def generate_summary(self, documents, summary_type="map_reduce", raw_text=None):
        """Generate summary using LangChain or robust HF chunking.

        Parameters:
            documents: LangChain Documents (used on the LangChain path).
            summary_type: "map_reduce", "stuff", or anything else -> "refine".
            raw_text: Original text; required for the HF pipeline path.

        Returns:
            Summary string, or an error message on failure.
        """
        try:
            # Hugging Face models go through token-aware chunking instead of
            # LangChain chains (needed for models with hard token limits).
            if self.is_hf_pipeline and raw_text and self.hf_summarizer and self.hf_tokenizer:
                return summarize_long_text_hf(
                    raw_text, self.hf_summarizer, self.hf_tokenizer,
                    max_tokens=1024, overlap=50, max_length=150, min_length=40
                )
            if summary_type == "map_reduce":
                chain = load_summarize_chain(self.llm, chain_type="map_reduce", verbose=False)
            elif summary_type == "stuff":
                chain = load_summarize_chain(self.llm, chain_type="stuff", verbose=False)
            else:
                # Any other value falls back to the iterative "refine" chain.
                chain = load_summarize_chain(self.llm, chain_type="refine", verbose=False)
            summary = chain.run(documents)
            return summary
        except Exception as e:
            return f"β Error generating summary: {str(e)}"

    def create_structured_summary(self, text, documents):
        """Create a structured summary with 'overall' and 'key_points' sections."""
        summaries = {}
        summaries['overall'] = self.generate_summary(documents, "map_reduce", raw_text=text)
        # Key points are extracted from the first 8000 chars only, keeping the
        # prompt small enough for a single "stuff" pass.
        key_points_text = text[:8000] if len(text) > 8000 else text
        key_points_prompt = f"""
        Extract the 5-7 most important key points from this research paper:
        {key_points_text}
        """
        key_points_docs = [Document(page_content=key_points_prompt)]
        summaries['key_points'] = self.generate_summary(key_points_docs, "stuff", raw_text=key_points_prompt)
        return summaries

    def create_pdf_summary(self, summaries, paper_title="Research Paper Summary"):
        """Create PDF with the summary and return the temp-file path.

        The file is created with delete=False, so the caller (Gradio's File
        component) is responsible for its lifetime.
        """
        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []
        # Custom title style layered on the stock Heading1.
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor='darkblue'
        )
        story.append(Paragraph(paper_title, title_style))
        story.append(Spacer(1, 12))
        story.append(Paragraph("Overall Summary", styles['Heading2']))
        story.append(Spacer(1, 12))
        story.append(Paragraph(summaries.get('overall', 'No summary available'), styles['Normal']))
        story.append(Spacer(1, 20))
        # Key-points section is optional (only present for structured runs).
        if 'key_points' in summaries:
            story.append(Paragraph("Key Points", styles['Heading2']))
            story.append(Spacer(1, 12))
            story.append(Paragraph(summaries['key_points'], styles['Normal']))
        doc.build(story)
        buffer.seek(0)
        # Persist to a real file so Gradio's File component can serve it.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(buffer.getvalue())
        temp_file.close()
        return temp_file.name
|
|
|
|
|
|
|
|
# Module-level singleton shared by all Gradio callbacks.
summarizer = ResearchPaperSummarizer()
|
|
|
|
|
def process_paper(pdf_file, model_choice, summary_type, include_key_points, paper_title):
    """Main function to process the research paper.

    Runs the full pipeline (model setup -> extraction -> chunking ->
    summarization -> optional PDF export) and returns the four values
    bound to the Gradio outputs: status text, overall summary, key
    points, and a PDF path (or None).
    """
    # Stage 1: load/select the model; bail out early on failure.
    ok, message = summarizer.setup_llm(model_choice)
    if not ok:
        return message, "", "", None
    status = message + "\n\n"

    # Stage 2: pull raw text out of the uploaded PDF.
    text, extract_message = summarizer.extract_text_from_pdf(pdf_file)
    status += extract_message + "\n\n"
    if text is None:
        return status, "", "", None

    # Stage 3: chunk for LangChain processing.
    documents = summarizer.create_documents(text)
    status += f"π Text split into {len(documents)} chunks for processing\n\n"
    status += "π Generating summary... Please wait...\n\n"

    try:
        if include_key_points:
            summaries = summarizer.create_structured_summary(text, documents)
            overall = summaries.get('overall', 'No summary generated')
            points = summaries.get('key_points', 'No key points generated')
        else:
            overall = summarizer.generate_summary(documents, summary_type, raw_text=text)
            points = "Key points not requested"
            summaries = {'overall': overall}
        status += "π Summary generated successfully!"

        # Stage 4: optional PDF export, only when a title was supplied.
        report_path = None
        if paper_title and paper_title.strip():
            try:
                report_path = summarizer.create_pdf_summary(summaries, paper_title.strip())
                status += "\nπ PDF summary created!"
            except Exception as e:
                # PDF export failure is non-fatal; surface it in the status.
                status += f"\nβ οΈ PDF creation failed: {str(e)}"
        return status, overall, points, report_path
    except Exception as e:
        return status + f"β Error during processing: {str(e)}", "", "", None
|
|
|
|
|
def get_model_info(model_choice):
    """Return information about the selected model.

    Looks up the dropdown label and returns a short markdown blurb;
    unknown labels yield an empty string.
    """
    descriptions = (
        ("OpenAI GPT-3.5", "π‘ **Fast and Efficient** - Good for most tasks, paid API required"),
        ("OpenAI GPT-4", "π **Highest Quality** - Most advanced summaries, paid API required"),
        ("Hugging Face BART", "π **Free Model** - Optimized for summarization, slower on first load"),
        ("Hugging Face T5", "π **Free Versatile** - Good general-purpose model, slower on first load"),
    )
    for label, blurb in descriptions:
        if label == model_choice:
            return blurb
    return ""
|
|
|
|
|
|
|
|
# Custom CSS injected into gr.Blocks: gradient page background, card-style
# panels, and hover effects. Selectors target Gradio's built-in element
# classes (.gr-button, .gr-box, ...), so they may need updating if the
# Gradio version changes its class names.
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.gr-interface {
    background: rgba(255, 255, 255, 0.95);
    backdrop-filter: blur(10px);
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
}
.gr-box {
    border-radius: 15px;
    border: 2px solid #e1e5e9;
    background: linear-gradient(145deg, #ffffff, #f0f2f5);
}
.gr-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border: none;
    border-radius: 10px;
    color: white;
    font-weight: bold;
    transition: transform 0.2s;
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.gr-textbox, .gr-dropdown {
    border-radius: 10px;
    border: 2px solid #e1e5e9;
}
.gr-file {
    border-radius: 15px;
    border: 3px dashed #667eea;
    background: linear-gradient(145deg, #f8f9ff, #ffffff);
}
"""
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two-column layout (controls | results), two help accordions,
# and event wiring for the model-info preview and the main processing run.
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, title="π¬ Research Paper Summarizer", theme=gr.themes.Soft()) as app:
    # Header banner.
    gr.Markdown(
        """
        # π¬ Research Paper Summarizer
        ### Transform lengthy research papers into concise, insightful summaries using AI

        Upload your PDF research paper and get an intelligent summary with key points extracted automatically!
        """,
        elem_classes="header"
    )
    with gr.Row():
        # Left column: upload + configuration controls.
        with gr.Column(scale=1):
            gr.Markdown("## π Upload & Configure")
            pdf_input = gr.File(
                label="π Upload Research Paper (PDF)",
                file_types=[".pdf"],
                elem_classes="file-upload"
            )
            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI GPT-3.5",
                    "OpenAI GPT-4",
                    "Hugging Face BART",
                    "Hugging Face T5"
                ],
                value="Hugging Face BART",
                label="π€ Choose AI Model",
                info="Free models work without API keys"
            )
            # Filled by get_model_info() whenever the dropdown changes.
            model_info = gr.Markdown("")
            summary_type = gr.Dropdown(
                choices=["map_reduce", "stuff", "refine"],
                value="map_reduce",
                label="π Summary Method",
                info="map_reduce: best for long papers | stuff: faster for short papers | refine: iterative improvement"
            )
            include_key_points = gr.Checkbox(
                label="π Include Key Points",
                value=True,
                info="Extract important key points separately"
            )
            paper_title = gr.Textbox(
                label="π Paper Title (for PDF export)",
                placeholder="Enter the title of your research paper...",
                info="Optional: Used as title in the generated PDF summary"
            )
            process_btn = gr.Button(
                "π Generate Summary",
                variant="primary",
                size="lg",
                elem_classes="process-button"
            )
        # Right column: processing status and generated outputs.
        with gr.Column(scale=2):
            gr.Markdown("## π Results")
            status_output = gr.Textbox(
                label="π Processing Status",
                lines=8,
                max_lines=10,
                interactive=False,
                show_copy_button=True
            )
            summary_output = gr.Textbox(
                label="π Overall Summary",
                lines=10,
                max_lines=15,
                interactive=False,
                show_copy_button=True,
                placeholder="Your paper summary will appear here..."
            )
            key_points_output = gr.Textbox(
                label="π Key Points",
                lines=8,
                max_lines=12,
                interactive=False,
                show_copy_button=True,
                placeholder="Key points will be extracted here..."
            )
            pdf_output = gr.File(
                label="π Download PDF Summary",
                interactive=False
            )
    # Collapsible help: API-key setup instructions.
    with gr.Accordion("π§ Setup Instructions for API Keys", open=False):
        gr.Markdown(
            """
            ### For Enhanced Performance (Optional):
            **OpenAI API Setup:**
            1. Get your API key from [OpenAI Platform](https://platform.openai.com/api-keys)
            2. In your Hugging Face Space settings, add: `OPENAI_API_KEY = your_key_here`
            3. Restart your Space to apply changes

            **Hugging Face Token Setup:**
            1. Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
            2. Add: `HUGGINGFACE_TOKEN = your_token_here`
            3. Provides access to gated models and higher rate limits

            **Note:** Free Hugging Face models work without any API keys but may be slower on first load.
            """
        )
    # Collapsible help: usage tips.
    with gr.Accordion("π‘ Tips for Best Results", open=False):
        gr.Markdown(
            """
            ### Optimization Tips:
            - **π File Size:** Smaller PDFs (< 10MB) process faster
            - **π€ Model Choice:** OpenAI models provide highest quality but require API keys
            - **β‘ Speed:** "stuff" method is fastest for papers under 20 pages
            - **π Quality:** "map_reduce" works best for comprehensive summaries of long papers
            - **π First Load:** Hugging Face models may take 2-3 minutes to load initially
            - **π± Mobile:** Works on mobile devices but desktop recommended for large files
            """
        )
    # Event wiring: live model blurb on dropdown change.
    model_choice.change(
        fn=get_model_info,
        inputs=[model_choice],
        outputs=[model_info]
    )
    # Event wiring: main processing pipeline on button click.
    process_btn.click(
        fn=process_paper,
        inputs=[
            pdf_input,
            model_choice,
            summary_type,
            include_key_points,
            paper_title
        ],
        outputs=[
            status_output,
            summary_output,
            key_points_output,
            pdf_output
        ],
        show_progress=True
    )
    # Footer.
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: #666; font-size: 14px;">
        π¬ <strong>Research Paper Summarizer</strong> | Powered by LangChain & AI Models |
        Built with β€οΈ using Gradio
        </div>
        """,
        elem_classes="footer"
    )
|
|
|
|
|
if __name__ == "__main__":
    # share=True requests a public Gradio link; show_error/debug surface
    # tracebacks in the UI. Binding 0.0.0.0:7860 is the standard Hugging
    # Face Spaces configuration.
    app.launch(
        share=True,
        show_error=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860
    )
|
|
|