Spaces:
Sleeping
Sleeping
| # app.py | |
import html
import os
import re

import pytesseract
import streamlit as st
import torch
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
# Configuration
ABSTRACT_MODEL = "sshleifer/distilbart-cnn-12-6"  # summarization checkpoint
TITLE_MODEL = "linydub/bart-large-samsum"  # text2text checkpoint used for titles
MAX_FILE_SIZE_MB = 10  # upload limit enforced in main()

# Location of the Tesseract binary. The TESSERACT_PATH environment variable
# takes precedence; the Windows install location is kept as the fallback.
TESSERACT_PATH = os.environ.get(
    "TESSERACT_PATH",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

# Only override pytesseract's command when the configured binary really exists.
# Otherwise leave pytesseract's own default in place so a Tesseract available
# on PATH keeps working on hosts that do not have the path above.
if os.path.isfile(TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
@st.cache_resource(show_spinner='🚀 Loading AI models (first time 2-5 mins)...')
def load_models():
    """Build the abstract and title pipelines once and reuse them afterwards.

    The function is wrapped in ``st.cache_resource`` so the models are created
    on the first call only; later calls get the same objects back instead of
    reloading both checkpoints on every button press. The spinner message is
    shown by the cache decorator while the first load is in progress.

    Returns:
        tuple: ``(abstractive, title_pipe, abs_tokenizer, title_tokenizer)`` -
        the summarization pipeline, the text2text-generation pipeline, and the
        tokenizer each of them was built with.
    """
    # Use the first CUDA device when available, otherwise CPU (-1). Both
    # pipelines share the same placement.
    device = 0 if torch.cuda.is_available() else -1

    # Abstract model
    abs_tokenizer = AutoTokenizer.from_pretrained(ABSTRACT_MODEL)
    abstractive = pipeline(
        "summarization",
        model=ABSTRACT_MODEL,
        tokenizer=abs_tokenizer,
        device=device,
    )

    # Title model
    title_tokenizer = AutoTokenizer.from_pretrained(TITLE_MODEL)
    title_pipe = pipeline(
        "text2text-generation",
        model=TITLE_MODEL,
        tokenizer=title_tokenizer,
        max_length=60,
        device=device,
    )

    return abstractive, title_pipe, abs_tokenizer, title_tokenizer
def extract_text(pdf_file):
    """Return the cleaned text of a PDF, using OCR when it has no text layer.

    The embedded text of every page is read first. If that yields nothing but
    whitespace, the PDF bytes (``pdf_file.getvalue()``) are rendered to images
    and each image is passed through Tesseract. Whichever text is obtained is
    run through ``clean_text`` before being returned.

    Any exception is reported with ``st.error`` and an empty string is
    returned instead.
    """
    try:
        # Regular extraction: pages without text contribute an empty string.
        pages = PdfReader(pdf_file).pages
        text = " ".join(page.extract_text() or "" for page in pages)

        if text.strip():
            return clean_text(text)

        # Nothing extractable - treat the document as scanned images.
        page_images = convert_from_bytes(pdf_file.getvalue())
        ocr_text = " ".join(
            pytesseract.image_to_string(image) for image in page_images
        )
        return clean_text(ocr_text)
    except Exception as exc:
        st.error(f"PDF Error: {str(exc)}")
        return ""
# A line that holds only digits (a page number), together with the newlines
# and whitespace around it.
_PAGE_NUMBER_RE = re.compile(r'\n\s*(\d+)\s*\n')

# Boilerplate that is deleted outright; all matched case-insensitively.
_NOISE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Proceedings of .*?\n',        # Conference headers
        r'arXiv:\d+\.\d+v\d+.*?\n',     # arXiv footers
        r'©\d{4}.*?\n',                 # Copyright
        r'http\S+',                     # URLs
        r'\b(?:Figure|Table)\s+\d+',    # "Figure N" / "Table N" labels
    )
)


def clean_text(text):
    """Strip page numbers, headers/footers, URLs and figure/table labels.

    Args:
        text: Raw text extracted from a document. ``None`` or an empty string
            is accepted and yields ``""``.

    Returns:
        The text with the boilerplate removed and surrounding whitespace
        stripped.
    """
    if not text:
        return ""

    # Replace a page-number line with a single newline rather than nothing:
    # the match includes the newlines on both sides, so deleting it outright
    # would glue the end of one page onto the start of the next.
    text = _PAGE_NUMBER_RE.sub('\n', text)

    for noise_re in _NOISE_RES:
        text = noise_re.sub('', text)

    return text.strip()
def generate_title(abstract, title_pipe):
    """Produce a short title (at most five words) for the given abstract.

    ``title_pipe`` is called with an instruction prompt containing the
    abstract, using beam search with a small generation limit. Leftover
    ``<pad>`` / ``</s>`` markers are removed from the first result, and the
    text is cut down to its first five words when it is longer than that.
    """
    prompt = f"Generate a short, research-style title (4-5 words) for this abstract: {abstract}"

    outputs = title_pipe(
        prompt,
        num_beams=5,
        early_stopping=True,
        max_length=10,  # Limit to ~4-5 words
        do_sample=False,
    )
    candidate = outputs[0]['generated_text'].strip()

    # Drop special tokens that may survive decoding.
    for marker in ("<pad>", "</s>"):
        candidate = candidate.replace(marker, "")
    candidate = candidate.strip()

    # Keep the title as generated unless it runs past five words.
    tokens = candidate.split()
    return " ".join(tokens[:5]) if len(tokens) > 5 else candidate
def main():
    """Render the page and run the upload -> abstract -> title workflow.

    Nothing is processed until the ENTER button is pressed. The upload is then
    validated (a file must be present and no larger than ``MAX_FILE_SIZE_MB``),
    its text is extracted, an abstract is generated from the first 2000
    characters, and a title is generated from that abstract. Progress is shown
    in a status container; the final title and abstract are rendered below it.
    """
    # Main title
    st.markdown(
        "<h1 style='text-align: center;'>RESEARCH PAPER TITLE AND ABSTRACT GENERATION</h1>",
        unsafe_allow_html=True,
    )

    # Upload section
    col1, col2 = st.columns([4, 1])
    with col1:
        uploaded_file = st.file_uploader("Upload here", type=["pdf"], label_visibility="collapsed")
    with col2:
        generate_btn = st.button("ENTER", use_container_width=True)

    if not generate_btn:
        return

    # Tell the user why nothing happens instead of silently ignoring the click.
    if not uploaded_file:
        st.warning("Please upload a PDF file first")
        return

    if uploaded_file.size > MAX_FILE_SIZE_MB * 1024 * 1024:
        st.error(f"File too large! Max {MAX_FILE_SIZE_MB}MB allowed")
        return

    raw_text = extract_text(uploaded_file)
    if not raw_text.strip():
        st.warning("No text extracted - document might be corrupted")
        return

    # Only the two pipelines are needed here; the tokenizers are ignored.
    abstract_pipe, title_pipe, _, _ = load_models()

    with st.status("Processing...", expanded=True) as status:
        try:
            # Processing steps
            st.write("📖 Analyzing document...")
            clean_abstract_text = raw_text[:2000]  # First 2000 characters

            st.write("✍️ Generating abstract...")
            abstract = abstract_pipe(
                clean_abstract_text,
                max_length=150,
                min_length=50,
                do_sample=False,
            )[0]['summary_text']

            st.write("🖋️ Creating title...")
            title = generate_title(abstract, title_pipe)
        except Exception as e:
            # Mark the status as failed so it does not keep showing "running".
            status.update(label="Processing failed", state="error", expanded=True)
            st.error(f"Processing failed: {str(e)}")
            return

        status.update(label="Complete!", state="complete", expanded=False)

    # Display results outside the status container: it has just been
    # collapsed, so anything rendered inside it would be hidden from view.
    # The generated text comes from an uploaded document, so it is escaped
    # before being placed into raw HTML.
    results_html = "\n".join([
        "<div style='margin-top: 30px;'>",
        "<p style='font-size: 14px; font-weight: bold;'>TITLE</p>",
        f"<p style='font-size: 14px; margin-bottom: 20px;'>{html.escape(title)}</p>",
        "<p style='font-size: 12px; font-weight: bold;'>ABSTRACT</p>",
        f"<p style='font-size: 12px;'>{html.escape(abstract)}</p>",
        "</div>",
    ])
    st.markdown(results_html, unsafe_allow_html=True)


if __name__ == "__main__":
    main()