Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- src/modules/__init__.py +6 -0
- src/modules/__pycache__/__init__.cpython-310.pyc +0 -0
- src/modules/__pycache__/pdf_processor.cpython-310.pyc +0 -0
- src/modules/__pycache__/text_summarizer.cpython-310.pyc +0 -0
- src/modules/__pycache__/utils.cpython-310.pyc +0 -0
- src/modules/pdf_processor.py +160 -0
- src/modules/text_summarizer.py +268 -0
- src/modules/utils.py +122 -0
- src/setup.py +42 -0
- src/test_basic.py +135 -0
src/modules/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AI Notes Summarizer Modules
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
__version__ = "1.0.0"
|
| 6 |
+
__author__ = "AI Notes Summarizer"
|
src/modules/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (260 Bytes). View file
|
|
|
src/modules/__pycache__/pdf_processor.cpython-310.pyc
ADDED
|
Binary file (4.38 kB). View file
|
|
|
src/modules/__pycache__/text_summarizer.cpython-310.pyc
ADDED
|
Binary file (16.1 kB). View file
|
|
|
src/modules/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (3.29 kB). View file
|
|
|
src/modules/pdf_processor.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Processing Module
|
| 3 |
+
Handles PDF file upload, text extraction, and preprocessing.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import PyPDF2
|
| 7 |
+
import io
|
| 8 |
+
import re
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
import streamlit as st
|
| 11 |
+
|
| 12 |
+
class PDFProcessor:
    """Validate uploaded PDF files, extract their text, and clean it up."""

    def __init__(self):
        # Reject uploads above 10MB to keep processing responsive.
        self.max_file_size = 10 * 1024 * 1024  # 10MB limit

    def validate_pdf(self, uploaded_file) -> bool:
        """Check the size and MIME type of an uploaded file.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            bool: True when the file looks like an acceptable PDF
        """
        if uploaded_file.size > self.max_file_size:
            st.error(f"File size ({uploaded_file.size / 1024 / 1024:.1f}MB) exceeds limit (10MB)")
            return False
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False
        return True

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """Pull the text out of every page of the uploaded PDF.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Extracted text, or None when nothing readable was found
        """
        try:
            # Rewind in case the stream was read earlier (e.g. by validation).
            uploaded_file.seek(0)
            reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            if reader.is_encrypted:
                st.error("β Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            page_total = len(reader.pages)
            if page_total == 0:
                st.error("β PDF file appears to be empty or corrupted.")
                return None
            if page_total > 100:
                st.warning(f"β οΈ Large PDF detected ({page_total} pages). Processing may take longer.")

            # Gather per-page text; remember which pages failed so the
            # user can be told without aborting the whole extraction.
            pieces = []
            failed_pages = []
            for index, page in enumerate(reader.pages):
                try:
                    extracted = page.extract_text()
                    if extracted.strip():  # keep only pages with real text
                        pieces.append(extracted + "\n")
                except Exception:
                    failed_pages.append(index + 1)
                    continue
            text_content = "".join(pieces)

            if failed_pages:
                if len(failed_pages) < 5:
                    st.warning(f"β οΈ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
                else:
                    st.warning(f"β οΈ Could not extract text from {len(failed_pages)} pages")

            if not text_content.strip():
                st.error("β No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # A tiny amount of text usually means a scanned/image-only PDF.
            if len(text_content.strip()) < 100:
                st.warning("β οΈ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"β Invalid or corrupted PDF file: {str(e)}")
            return None
        except MemoryError:
            st.error("β PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            st.error(f"β Unexpected error processing PDF file: {str(e)}")
            return None

    def preprocess_text(self, text: str) -> str:
        """Normalise whitespace and drop unexpected characters.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text ready for summarization
        """
        if not text:
            return ""

        # Collapse newline runs, then any remaining whitespace runs.
        collapsed = re.sub(r'\s+', ' ', re.sub(r'\n+', '\n', text))
        # Whitelist word characters plus common punctuation; everything
        # else becomes a space so words stay separated.
        sanitized = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', collapsed)
        # split/join removes any doubled spaces the substitution created.
        return ' '.join(sanitized.split()).strip()

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """Run validation, extraction and preprocessing in sequence.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content, or None when any stage fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        processed = self.preprocess_text(raw_text)
        if len(processed) < 50:
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")
        return processed
|
src/modules/text_summarizer.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text Summarization Module
|
| 3 |
+
Handles text summarization using Hugging Face Transformers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from transformers import pipeline, AutoTokenizer
|
| 7 |
+
import torch
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
import streamlit as st
|
| 10 |
+
import re
|
| 11 |
+
|
| 12 |
+
class TextSummarizer:
    """Class to handle text summarization using pre-trained models"""

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """
        Initialize the text summarizer

        Args:
            model_name: Name of the pre-trained model to use
        """
        self.model_name = model_name
        # Pipeline and tokenizer are loaded lazily by load_model().
        self.summarizer = None
        self.tokenizer = None
        self.max_chunk_length = 1024  # Maximum tokens per chunk
        self.min_summary_length = 50
        self.max_summary_length = 300

    # NOTE(review): @st.cache_resource on an instance method caches the
    # boolean return value, not the pipeline; on a Streamlit rerun with a
    # fresh instance the cached True may be returned while
    # _self.summarizer is still None. The leading-underscore parameter
    # name (_self) excludes the instance from the cache key. Verify this
    # against Streamlit's caching documentation before relying on it.
    @st.cache_resource
    def load_model(_self):
        """
        Load the summarization model and tokenizer

        Returns:
            bool: True when the pipeline and tokenizer loaded successfully.
        """
        try:
            # Check if CUDA is available; pipeline() takes -1 for CPU.
            device = 0 if torch.cuda.is_available() else -1

            # Show device info
            if torch.cuda.is_available():
                st.info(f"π Using GPU acceleration: {torch.cuda.get_device_name()}")
            else:
                st.info("π» Using CPU for processing (this may be slower)")

            # Load the summarization pipeline
            _self.summarizer = pipeline(
                "summarization",
                model=_self.model_name,
                device=device,
                # half precision only on GPU to save memory
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )

            # Load tokenizer for text chunking
            _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)

            st.success(f"β Model loaded successfully: {_self.model_name}")
            return True

        except OSError as e:
            # Hugging Face raises OSError for both network and cache problems;
            # distinguish the network case by message text.
            if "Connection error" in str(e) or "timeout" in str(e).lower():
                st.error("β Network error: Could not download the model. Please check your internet connection.")
            else:
                st.error(f"β Model loading error: {str(e)}")
            return False
        except RuntimeError as e:
            if "CUDA" in str(e):
                # GPU failure (typically out of memory): retry once on CPU.
                st.error("β GPU memory error. Trying to use CPU instead...")
                try:
                    _self.summarizer = pipeline(
                        "summarization",
                        model=_self.model_name,
                        device=-1,  # Force CPU
                        torch_dtype=torch.float32
                    )
                    _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)
                    st.success("β Model loaded successfully on CPU")
                    return True
                except Exception as cpu_e:
                    st.error(f"β Failed to load model on CPU: {str(cpu_e)}")
                    return False
            else:
                st.error(f"β Runtime error loading model: {str(e)}")
                return False
        except Exception as e:
            st.error(f"β Unexpected error loading model: {str(e)}")
            return False

    def chunk_text(self, text: str) -> List[str]:
        """
        Split long text into smaller chunks for processing

        Args:
            text: Input text to chunk

        Returns:
            List[str]: List of text chunks
        """
        if not self.tokenizer:
            # Fallback chunking by sentences if tokenizer not available
            sentences = re.split(r'[.!?]+', text)
            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len(current_chunk) + len(sentence) < 2000:  # Rough character limit
                    current_chunk += sentence + ". "
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence + ". "

            # Flush the trailing partial chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())

            return chunks

        # Use tokenizer for precise chunking: encode once, slice the token
        # ids into fixed-size windows, and decode each window back to text.
        tokens = self.tokenizer.encode(text)
        chunks = []

        for i in range(0, len(tokens), self.max_chunk_length):
            chunk_tokens = tokens[i:i + self.max_chunk_length]
            chunk_text = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)

        return chunks

    def summarize_chunk(self, chunk: str) -> Optional[str]:
        """
        Summarize a single text chunk

        Args:
            chunk: Text chunk to summarize

        Returns:
            str: Summary of the chunk or None if summarization fails
        """
        try:
            # Adjust summary length based on chunk length: target roughly a
            # third of the input, clamped to the configured min/max bounds.
            chunk_length = len(chunk.split())
            max_length = min(self.max_summary_length, max(self.min_summary_length, chunk_length // 3))
            min_length = min(self.min_summary_length, max_length // 2)

            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,  # greedy decoding for deterministic output
                truncation=True
            )

            return summary[0]['summary_text']

        except Exception as e:
            # Caller decides what to do with a failed chunk; just warn here.
            st.warning(f"Error summarizing chunk: {str(e)}")
            return None

    def format_as_bullets(self, summary_text: str) -> str:
        """
        Format summary text as bullet points

        Args:
            summary_text: Raw summary text

        Returns:
            str: Formatted bullet points
        """
        # Split by sentences and create bullet points
        sentences = re.split(r'[.!?]+', summary_text)
        bullets = []

        for sentence in sentences:
            sentence = sentence.strip()
            if sentence and len(sentence) > 10:  # Filter out very short fragments
                bullets.append(f"β’ {sentence}")

        return '\n'.join(bullets)

    def summarize_text(self, text: str) -> Optional[str]:
        """
        Complete text summarization pipeline

        Args:
            text: Input text to summarize

        Returns:
            str: Formatted summary or None if summarization fails
        """
        if not text or len(text.strip()) < 100:
            st.error("β Text is too short to summarize effectively (minimum 100 characters required)")
            return None

        # Check text length limits
        word_count = len(text.split())
        if word_count > 10000:
            st.warning(f"β οΈ Large text detected ({word_count:,} words). Processing may take several minutes.")

        try:
            # Load model if not already loaded
            if not self.summarizer:
                with st.spinner("π€ Loading AI model..."):
                    if not self.load_model():
                        return None

            # Chunk the text
            chunks = self.chunk_text(text)

            if len(chunks) == 0:
                st.error("β Could not process the text into chunks")
                return None

            st.info(f"π Processing {len(chunks)} text chunk(s)...")

            # Summarize each chunk, tracking progress and failures.
            summaries = []
            progress_bar = st.progress(0)
            failed_chunks = 0

            for i, chunk in enumerate(chunks):
                try:
                    with st.spinner(f"π Summarizing part {i+1} of {len(chunks)}..."):
                        chunk_summary = self.summarize_chunk(chunk)
                        if chunk_summary:
                            summaries.append(chunk_summary)
                        else:
                            failed_chunks += 1
                except Exception as e:
                    st.warning(f"β οΈ Failed to summarize chunk {i+1}: {str(e)}")
                    failed_chunks += 1
                    continue  # NOTE: skips the progress update below

                progress_bar.progress((i + 1) / len(chunks))

            # Check if we have any successful summaries
            if not summaries:
                st.error("β Could not generate any summaries from the text")
                return None

            if failed_chunks > 0:
                st.warning(f"β οΈ {failed_chunks} out of {len(chunks)} chunks failed to process")

            # Combine summaries
            combined_summary = ' '.join(summaries)

            # If we have multiple chunks, summarize the combined summary
            # (second pass keeps the final output concise).
            if len(chunks) > 1 and len(combined_summary.split()) > 200:
                try:
                    with st.spinner("π Creating final summary..."):
                        final_summary = self.summarize_chunk(combined_summary)
                        if final_summary:
                            combined_summary = final_summary
                except Exception as e:
                    st.warning(f"β οΈ Could not create final summary, using combined chunks: {str(e)}")

            # Format as bullet points
            formatted_summary = self.format_as_bullets(combined_summary)

            if not formatted_summary.strip():
                st.error("β Generated summary is empty")
                return None

            return formatted_summary

        except MemoryError:
            st.error("β Out of memory. Please try with a shorter text or restart the application.")
            return None
        except Exception as e:
            st.error(f"β Unexpected error during summarization: {str(e)}")
            return None
|
src/modules/utils.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for the AI Notes Summarizer application
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from typing import Optional
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
def setup_logging():
    """Configure root logging to write both to 'app.log' and the console.

    Returns:
        logging.Logger: Logger named after this module.
    """
    log_targets = [
        logging.FileHandler('app.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_targets,
    )
    return logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
def validate_input(text: str, min_length: int = 50) -> bool:
    """Report whether *text* is non-empty and long enough to process.

    Args:
        text: Input text to validate
        min_length: Minimum required length in characters

    Returns:
        bool: True if valid, False otherwise (an error is shown via st)
    """
    stripped = text.strip() if text else ""

    if not stripped:
        st.error("Please provide some text content")
        return False

    if len(stripped) < min_length:
        st.error(f"Text is too short. Please provide at least {min_length} characters.")
        return False

    return True
|
| 42 |
+
|
| 43 |
+
def clean_text(text: str) -> str:
    """Collapse whitespace and strip unexpected symbols from *text*.

    Args:
        text: Raw text content

    Returns:
        str: Normalised text (empty string for falsy input)
    """
    if not text:
        return ""

    # First collapse all whitespace runs into single spaces.
    normalised = re.sub(r'\s+', ' ', text)
    # Then whitelist word characters and common punctuation; anything
    # else is replaced by a space so tokens stay separated.
    normalised = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/]', ' ', normalised)
    # split/join removes any doubled spaces the substitution introduced.
    return ' '.join(normalised.split()).strip()
|
| 66 |
+
|
| 67 |
+
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as B, KB, or MB.

    Args:
        size_bytes: Size in bytes

    Returns:
        str: Human readable size string with one decimal for KB/MB
    """
    kb = 1024
    mb = kb * kb
    if size_bytes >= mb:
        return f"{size_bytes / mb:.1f} MB"
    if size_bytes >= kb:
        return f"{size_bytes / kb:.1f} KB"
    return f"{size_bytes} B"
|
| 83 |
+
|
| 84 |
+
def display_summary_stats(original_text: str, summary: str):
    """Show word counts and compression ratio as three Streamlit metrics.

    Args:
        original_text: Original input text
        summary: Generated summary
    """
    original_words = len(original_text.split())
    summary_words = len(summary.split())

    # Guard against division by zero on empty input.
    if original_words > 0:
        compression_ratio = (1 - summary_words / original_words) * 100
    else:
        compression_ratio = 0

    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric("Original Words", f"{original_words:,}")

    with col2:
        st.metric("Summary Words", f"{summary_words:,}")

    with col3:
        st.metric("Compression", f"{compression_ratio:.1f}%")
|
| 106 |
+
|
| 107 |
+
def create_download_link(content: str, filename: str = "summary.txt") -> str:
    """Build an HTML anchor that downloads *content* as a text file.

    Args:
        content: Content to download
        filename: Name suggested to the browser for the downloaded file

    Returns:
        str: HTML download link using a base64 data URI
    """
    import base64

    b64 = base64.b64encode(content.encode()).decode()
    # BUG FIX: the download attribute previously contained a hard-coded
    # placeholder string, so the `filename` parameter was ignored and
    # every download got a meaningless name.
    href = f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download Summary</a>'
    return href
|
src/setup.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Setup script for AI Notes Summarizer
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from setuptools import setup, find_packages
|
| 6 |
+
|
| 7 |
+
with open("README.md", "r", encoding="utf-8") as fh:
|
| 8 |
+
long_description = fh.read()
|
| 9 |
+
|
| 10 |
+
with open("requirements.txt", "r", encoding="utf-8") as fh:
|
| 11 |
+
requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]
|
| 12 |
+
|
| 13 |
+
setup(
|
| 14 |
+
name="ai-notes-summarizer",
|
| 15 |
+
version="1.0.0",
|
| 16 |
+
author="AI Notes Summarizer",
|
| 17 |
+
description="A web application for AI-powered document summarization",
|
| 18 |
+
long_description=long_description,
|
| 19 |
+
long_description_content_type="text/markdown",
|
| 20 |
+
packages=find_packages(),
|
| 21 |
+
classifiers=[
|
| 22 |
+
"Development Status :: 4 - Beta",
|
| 23 |
+
"Intended Audience :: Education",
|
| 24 |
+
"Intended Audience :: End Users/Desktop",
|
| 25 |
+
"License :: OSI Approved :: MIT License",
|
| 26 |
+
"Operating System :: OS Independent",
|
| 27 |
+
"Programming Language :: Python :: 3",
|
| 28 |
+
"Programming Language :: Python :: 3.8",
|
| 29 |
+
"Programming Language :: Python :: 3.9",
|
| 30 |
+
"Programming Language :: Python :: 3.10",
|
| 31 |
+
"Programming Language :: Python :: 3.11",
|
| 32 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 33 |
+
"Topic :: Text Processing :: Linguistic",
|
| 34 |
+
],
|
| 35 |
+
python_requires=">=3.8",
|
| 36 |
+
install_requires=requirements,
|
| 37 |
+
entry_points={
|
| 38 |
+
"console_scripts": [
|
| 39 |
+
"ai-notes-summarizer=app:main",
|
| 40 |
+
],
|
| 41 |
+
},
|
| 42 |
+
)
|
src/test_basic.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Basic tests for AI Notes Summarizer modules
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add the current directory to Python path
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
|
| 12 |
+
def test_imports():
    """Test if all modules can be imported"""
    print("Testing module imports...")

    # Each import is attempted independently; the first failure aborts.
    try:
        from modules.pdf_processor import PDFProcessor
    except ImportError as e:
        print(f"β Failed to import PDF Processor: {e}")
        return False
    print("β PDF Processor imported successfully")

    try:
        from modules.text_summarizer import TextSummarizer
    except ImportError as e:
        print(f"β Failed to import Text Summarizer: {e}")
        return False
    print("β Text Summarizer imported successfully")

    try:
        from modules.utils import setup_logging, validate_input
    except ImportError as e:
        print(f"β Failed to import Utils: {e}")
        return False
    print("β Utils imported successfully")

    return True
|
| 38 |
+
|
| 39 |
+
def test_pdf_processor():
    """Test PDF processor basic functionality"""
    print("\nTesting PDF Processor...")

    try:
        from modules.pdf_processor import PDFProcessor

        # Exercise only the pure-text path; no PDF file is required.
        sample = "This is a test\n\nwith multiple spaces\nand newlines."
        cleaned = PDFProcessor().preprocess_text(sample)
        print(f"β Text preprocessing works: '{cleaned}'")

        return True
    except Exception as e:
        print(f"β PDF Processor test failed: {e}")
        return False
|
| 56 |
+
|
| 57 |
+
def test_text_summarizer():
    """Test text summarizer basic functionality"""
    print("\nTesting Text Summarizer...")

    try:
        from modules.text_summarizer import TextSummarizer

        summarizer = TextSummarizer()

        # Chunking runs without downloading the model (tokenizer is None,
        # so the sentence-based fallback is used).
        chunks = summarizer.chunk_text("This is a test sentence. " * 100)
        print(f"β Text chunking works: {len(chunks)} chunks created")

        sample_summary = "This is the first point. This is the second point. This is the third point."
        bullets = summarizer.format_as_bullets(sample_summary)
        print(f"β Bullet formatting works:\n{bullets}")

        return True
    except Exception as e:
        print(f"β Text Summarizer test failed: {e}")
        return False
|
| 79 |
+
|
| 80 |
+
def test_utils():
    """Test utility functions"""
    print("\nTesting Utils...")

    try:
        from modules.utils import validate_input, clean_text, format_file_size

        # Input validation on a string long enough to pass.
        result = validate_input("This is a test text that is long enough to pass validation.")
        print(f"β Input validation works: {result}")

        # Text cleaning strips disallowed symbols.
        cleaned = clean_text("This has multiple spaces and special@#$%characters!")
        print(f"β Text cleaning works: '{cleaned}'")

        # File size formatting: exactly 1 MB.
        size_str = format_file_size(1024 * 1024)
        print(f"β File size formatting works: {size_str}")

        return True
    except Exception as e:
        print(f"β Utils test failed: {e}")
        return False
|
| 104 |
+
|
| 105 |
+
def main():
    """Run all tests"""
    print("π§ͺ Running Basic Tests for AI Notes Summarizer\n")

    # Order matters: import checks run first so later failures are clearer.
    suite = [
        test_imports,
        test_pdf_processor,
        test_text_summarizer,
        test_utils,
    ]

    passed = 0
    for check in suite:
        if check():
            passed += 1
        print()

    total = len(suite)
    print(f"π Test Results: {passed}/{total} tests passed")

    if passed == total:
        print("π All tests passed! The application is ready to run.")
        return True
    print("β οΈ Some tests failed. Please check the errors above.")
    return False
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
success = main()
|
| 135 |
+
sys.exit(0 if success else 1)
|