|
|
|
|
|
import html
import os
import re

import pytesseract
import streamlit as st
import torch
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
|
|
|
|
|
|
|
|
# Hugging Face model id used by load_models() for the "summarization" pipeline.
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"

# Hugging Face model id used by load_models() for the "text2text-generation"
# pipeline that produces the title.
TITLE_MODEL = "linydub/bart-large-samsum"

# Upload size limit (in megabytes) enforced in main().
MAX_FILE_SIZE_MB = 10

# Default install location of the Tesseract binary on Windows.
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only point pytesseract at the Windows install when that file really exists.
# Otherwise keep pytesseract's own default so OCR still works on machines
# where the binary is found elsewhere (e.g. via PATH on Linux/macOS).
if os.path.isfile(TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
|
|
|
|
|
@st.cache_resource
def load_models():
    """Build the summarization and title pipelines together with their tokenizers.

    The function is wrapped in ``st.cache_resource`` so the models are
    created once and reused on later calls.

    Returns:
        tuple: ``(abstractive, title_pipe, abs_tokenizer, title_tokenizer)``
        where the first two items are ``transformers`` pipelines and the last
        two are the tokenizers loaded for ``ABSTRACT_MODEL`` and
        ``TITLE_MODEL`` respectively.
    """
    # Resolve the device once so both pipelines run on the same hardware
    # (first CUDA device when available, CPU otherwise).
    device = 0 if torch.cuda.is_available() else -1

    with st.spinner('Loading AI models (first time 2-5 mins)...'):
        abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
        abstractive = pipeline(
            "summarization",
            model=ABSTRACT_MODEL,
            tokenizer=abs_tokenizer,
            device=device,
        )

        title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
        title_pipe = pipeline(
            "text2text-generation",
            model=TITLE_MODEL,
            tokenizer=title_tokenizer,
            max_length=60,
            device=device,
        )

    return abstractive, title_pipe, abs_tokenizer, title_tokenizer
|
|
|
|
|
def extract_text(pdf_file):
    """Return the cleaned text of an uploaded PDF, using OCR when it has no text layer.

    The embedded text of every page is read with ``PdfReader`` first. When
    that yields nothing but whitespace, the pages are rendered to images from
    ``pdf_file.getvalue()`` and passed through ``pytesseract``. The result is
    run through ``clean_text``.

    Any exception is reported with ``st.error`` and an empty string is
    returned instead of raising.
    """
    try:
        page_texts = []
        for page in PdfReader(pdf_file).pages:
            # extract_text() may give back nothing for a page; treat it as empty.
            page_texts.append(page.extract_text() or "")
        extracted = " ".join(page_texts)

        if extracted.strip():
            return clean_text(extracted)

        # No usable text layer: fall back to OCR on rendered page images.
        page_images = convert_from_bytes(pdf_file.getvalue())
        ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]
        return clean_text(" ".join(ocr_chunks))
    except Exception as err:
        st.error(f"PDF Error: {err!s}")
        return ""
|
|
|
|
|
def clean_text(text):
    """Strip page numbers, headers/footers, URLs and figure/table references.

    Args:
        text: Raw text extracted from a PDF. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        str: ``text`` with every noise pattern removed (case-insensitively)
        and surrounding whitespace stripped.
    """
    if not text:
        return ""

    noise_patterns = (
        # A line holding only a number (page number). The trailing newline is
        # matched with a lookahead so the neighbouring lines stay separated
        # and consecutive number-only lines are all removed.
        r'\n\s*\d+\s*(?=\n)',
        # Conference header line.
        r'Proceedings of .*?\n',
        # arXiv identifier line.
        r'arXiv:\d+\.\d+v\d+.*?\n',
        # Copyright line; the sign is written as an escape so the pattern
        # does not depend on the encoding of this source file.
        r'\u00a9\s*\d{4}.*?\n',
        # URLs.
        r'http\S+',
        # "Figure 3" / "Table 12" references.
        r'\b(?:Figure|Table)\s+\d+',
    )

    for pattern in noise_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    return text.strip()
|
|
|
|
|
def generate_title(abstract, title_pipe):
    """Produce a short title (at most five words) for the given abstract.

    Args:
        abstract: Abstract text that is embedded in the prompt.
        title_pipe: Callable invoked with the prompt and generation keyword
            arguments; its first result must expose ``'generated_text'``.

    Returns:
        str: The generated text without ``<pad>`` / ``</s>`` markers, cut to
        its first five whitespace-separated words when it is longer.
    """
    word_limit = 5
    request = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"

    generations = title_pipe(
        request,
        num_beams=5,
        early_stopping=True,
        max_length=10,
        do_sample=False,
    )
    candidate = generations[0]['generated_text'].strip()

    # Drop special tokens that may be left in the decoded output.
    for special_token in ("<pad>", "</s>"):
        candidate = candidate.replace(special_token, "")
    candidate = candidate.strip()

    pieces = candidate.split()
    if len(pieces) <= word_limit:
        return candidate
    return " ".join(pieces[:word_limit])
|
|
|
|
|
def main():
    """Render the Streamlit page and run the PDF -> abstract -> title workflow.

    The page shows a PDF uploader and an ENTER button. When the button is
    pressed the upload is size-checked, its text is extracted, an abstract is
    generated from the first 2000 characters, and a title is generated from
    that abstract. Problems are reported through Streamlit messages; nothing
    is returned.
    """
    st.markdown(
        "<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
        unsafe_allow_html=True,
    )

    col1, col2 = st.columns([4, 1])
    with col1:
        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
    with col2:
        generate_btn = st.button("ENTER", use_container_width=True)

    if not generate_btn:
        return

    # Tell the user why nothing happens instead of silently ignoring the click.
    if uploaded_file is None:
        st.warning("Please upload a PDF file first")
        return

    if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
        st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
        return

    raw_text = extract_text(uploaded_file)
    if not raw_text.strip():
        st.warning("No text extracted - document might be corrupted")
        return

    # The tokenizers are returned by load_models() but are not needed here.
    abstract_pipe, title_pipe, _abs_tokenizer, _title_tokenizer = load_models()

    with st.status("Processing...", expanded=True) as status:
        try:
            st.write("Analyzing document...")
            clean_abstract_text = raw_text[:2000]

            st.write("Generating abstract...")
            abstract = abstract_pipe(
                clean_abstract_text,
                max_length=150,
                min_length=50,
                do_sample=False,
                # Guard against inputs that tokenize to more than the model accepts.
                truncation=True,
            )[0]['summary_text']

            st.write("Creating title...")
            title = generate_title(abstract, title_pipe)
        except Exception as e:
            # Do not leave the status widget in its "running" state on failure.
            status.update(label="Processing failed", state="error")
            st.error(f"Processing failed: {str(e)}")
            return

        status.update(label="Complete!", state="complete", expanded=False)

    # Render the result outside the (now collapsed) status container so it is
    # visible immediately. The model output is derived from the uploaded PDF,
    # so it is escaped before being placed into raw HTML. The markup is built
    # without leading indentation so Markdown cannot treat it as a code block.
    result_html = (
        "<div style='margin-top: 30px;'>"
        "<p style='font-size: 14px; font-weight: bold;'>TITLE</p>"
        f"<p style='font-size: 14px; margin-bottom: 20px;'>{html.escape(title)}</p>"
        "<p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>"
        f"<p style='font-size: 12px;'>{html.escape(abstract)}</p>"
        "</div>"
    )
    st.markdown(result_html, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
|
|
|