Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- src/modules/__init__.py +6 -0
- src/modules/__pycache__/__init__.cpython-310.pyc +0 -0
- src/modules/__pycache__/pdf_processor.cpython-310.pyc +0 -0
- src/modules/__pycache__/text_summarizer.cpython-310.pyc +0 -0
- src/modules/__pycache__/utils.cpython-310.pyc +0 -0
- src/modules/pdf_processor.py +160 -0
- src/modules/text_summarizer.py +268 -0
- src/modules/utils.py +122 -0
- src/setup.py +42 -0
- src/test_basic.py +135 -0
src/modules/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AI Notes Summarizer Modules
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
__version__ = "1.0.0"
|
| 6 |
+
__author__ = "AI Notes Summarizer"
|
src/modules/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (260 Bytes). View file
|
|
|
src/modules/__pycache__/pdf_processor.cpython-310.pyc
ADDED
|
Binary file (4.38 kB). View file
|
|
|
src/modules/__pycache__/text_summarizer.cpython-310.pyc
ADDED
|
Binary file (16.1 kB). View file
|
|
|
src/modules/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (3.29 kB). View file
|
|
|
src/modules/pdf_processor.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Processing Module
|
| 3 |
+
Handles PDF file upload, text extraction, and preprocessing.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import PyPDF2
|
| 7 |
+
import io
|
| 8 |
+
import re
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
import streamlit as st
|
| 11 |
+
|
| 12 |
+
class PDFProcessor:
    """Validate uploaded PDF files, extract their text, and clean it up."""

    def __init__(self):
        # Reject uploads above 10MB to keep processing responsive.
        self.max_file_size = 10 * 1024 * 1024  # 10MB limit

    def validate_pdf(self, uploaded_file) -> bool:
        """Check the size and MIME type of an uploaded file.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            bool: True when the file looks like an acceptable PDF
        """
        if uploaded_file.size > self.max_file_size:
            st.error(f"File size ({uploaded_file.size / 1024 / 1024:.1f}MB) exceeds limit (10MB)")
            return False
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False
        return True

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """Pull the text out of every page of the uploaded PDF.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Extracted text, or None when nothing readable was found
        """
        try:
            # Rewind in case the stream was read earlier (e.g. by validation).
            uploaded_file.seek(0)
            reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            if reader.is_encrypted:
                st.error("β Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            page_total = len(reader.pages)
            if page_total == 0:
                st.error("β PDF file appears to be empty or corrupted.")
                return None
            if page_total > 100:
                st.warning(f"β οΈ Large PDF detected ({page_total} pages). Processing may take longer.")

            # Gather per-page text; remember which pages failed so the
            # user can be told without aborting the whole extraction.
            pieces = []
            failed_pages = []
            for index, page in enumerate(reader.pages):
                try:
                    extracted = page.extract_text()
                    if extracted.strip():  # keep only pages with real text
                        pieces.append(extracted + "\n")
                except Exception:
                    failed_pages.append(index + 1)
                    continue
            text_content = "".join(pieces)

            if failed_pages:
                if len(failed_pages) < 5:
                    st.warning(f"β οΈ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
                else:
                    st.warning(f"β οΈ Could not extract text from {len(failed_pages)} pages")

            if not text_content.strip():
                st.error("β No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # A tiny amount of text usually means a scanned/image-only PDF.
            if len(text_content.strip()) < 100:
                st.warning("β οΈ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"β Invalid or corrupted PDF file: {str(e)}")
            return None
        except MemoryError:
            st.error("β PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            st.error(f"β Unexpected error processing PDF file: {str(e)}")
            return None

    def preprocess_text(self, text: str) -> str:
        """Normalise whitespace and drop unexpected characters.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text ready for summarization
        """
        if not text:
            return ""

        # Collapse newline runs, then any remaining whitespace runs.
        collapsed = re.sub(r'\s+', ' ', re.sub(r'\n+', '\n', text))
        # Whitelist word characters plus common punctuation; everything
        # else becomes a space so words stay separated.
        sanitized = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', collapsed)
        # split/join removes any doubled spaces the substitution created.
        return ' '.join(sanitized.split()).strip()

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """Run validation, extraction and preprocessing in sequence.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content, or None when any stage fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        processed = self.preprocess_text(raw_text)
        if len(processed) < 50:
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")
        return processed
|
src/modules/text_summarizer.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text Summarization Module
|
| 3 |
+
Handles text summarization using Hugging Face Transformers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from transformers import pipeline, AutoTokenizer
|
| 7 |
+
import torch
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
import streamlit as st
|
| 10 |
+
import re
|
| 11 |
+
|
| 12 |
+
class TextSummarizer:
    """Class to handle text summarization using pre-trained models"""

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """
        Initialize the text summarizer

        Args:
            model_name: Name of the pre-trained model to use
        """
        self.model_name = model_name
        # Pipeline and tokenizer are loaded lazily by load_model().
        self.summarizer = None
        self.tokenizer = None
        self.max_chunk_length = 1024  # Maximum tokens per chunk
        self.min_summary_length = 50
        self.max_summary_length = 300

    # NOTE(review): @st.cache_resource on an instance method caches the
    # boolean return value, not the pipeline; on a Streamlit rerun with a
    # fresh instance the cached True may be returned while
    # _self.summarizer is still None. The leading-underscore parameter
    # name (_self) excludes the instance from the cache key. Verify this
    # against Streamlit's caching documentation before relying on it.
    @st.cache_resource
    def load_model(_self):
        """
        Load the summarization model and tokenizer

        Returns:
            bool: True when the pipeline and tokenizer loaded successfully.
        """
        try:
            # Check if CUDA is available; pipeline() takes -1 for CPU.
            device = 0 if torch.cuda.is_available() else -1

            # Show device info
            if torch.cuda.is_available():
                st.info(f"π Using GPU acceleration: {torch.cuda.get_device_name()}")
            else:
                st.info("π» Using CPU for processing (this may be slower)")

            # Load the summarization pipeline
            _self.summarizer = pipeline(
                "summarization",
                model=_self.model_name,
                device=device,
                # half precision only on GPU to save memory
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )

            # Load tokenizer for text chunking
            _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)

            st.success(f"β Model loaded successfully: {_self.model_name}")
            return True

        except OSError as e:
            # Hugging Face raises OSError for both network and cache problems;
            # distinguish the network case by message text.
            if "Connection error" in str(e) or "timeout" in str(e).lower():
                st.error("β Network error: Could not download the model. Please check your internet connection.")
            else:
                st.error(f"β Model loading error: {str(e)}")
            return False
        except RuntimeError as e:
            if "CUDA" in str(e):
                # GPU failure (typically out of memory): retry once on CPU.
                st.error("β GPU memory error. Trying to use CPU instead...")
                try:
                    _self.summarizer = pipeline(
                        "summarization",
                        model=_self.model_name,
                        device=-1,  # Force CPU
                        torch_dtype=torch.float32
                    )
                    _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)
                    st.success("β Model loaded successfully on CPU")
                    return True
                except Exception as cpu_e:
                    st.error(f"β Failed to load model on CPU: {str(cpu_e)}")
                    return False
            else:
                st.error(f"β Runtime error loading model: {str(e)}")
                return False
        except Exception as e:
            st.error(f"β Unexpected error loading model: {str(e)}")
            return False

    def chunk_text(self, text: str) -> List[str]:
        """
        Split long text into smaller chunks for processing

        Args:
            text: Input text to chunk

        Returns:
            List[str]: List of text chunks
        """
        if not self.tokenizer:
            # Fallback chunking by sentences if tokenizer not available
            sentences = re.split(r'[.!?]+', text)
            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len(current_chunk) + len(sentence) < 2000:  # Rough character limit
                    current_chunk += sentence + ". "
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence + ". "

            # Flush the trailing partial chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())

            return chunks

        # Use tokenizer for precise chunking: encode once, slice the token
        # ids into fixed-size windows, and decode each window back to text.
        tokens = self.tokenizer.encode(text)
        chunks = []

        for i in range(0, len(tokens), self.max_chunk_length):
            chunk_tokens = tokens[i:i + self.max_chunk_length]
            chunk_text = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)

        return chunks

    def summarize_chunk(self, chunk: str) -> Optional[str]:
        """
        Summarize a single text chunk

        Args:
            chunk: Text chunk to summarize

        Returns:
            str: Summary of the chunk or None if summarization fails
        """
        try:
            # Adjust summary length based on chunk length: target roughly a
            # third of the input, clamped to the configured min/max bounds.
            chunk_length = len(chunk.split())
            max_length = min(self.max_summary_length, max(self.min_summary_length, chunk_length // 3))
            min_length = min(self.min_summary_length, max_length // 2)

            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,  # greedy decoding for deterministic output
                truncation=True
            )

            return summary[0]['summary_text']

        except Exception as e:
            # Caller decides what to do with a failed chunk; just warn here.
            st.warning(f"Error summarizing chunk: {str(e)}")
            return None

    def format_as_bullets(self, summary_text: str) -> str:
        """
        Format summary text as bullet points

        Args:
            summary_text: Raw summary text

        Returns:
            str: Formatted bullet points
        """
        # Split by sentences and create bullet points
        sentences = re.split(r'[.!?]+', summary_text)
        bullets = []

        for sentence in sentences:
            sentence = sentence.strip()
            if sentence and len(sentence) > 10:  # Filter out very short fragments
                bullets.append(f"β’ {sentence}")

        return '\n'.join(bullets)

    def summarize_text(self, text: str) -> Optional[str]:
        """
        Complete text summarization pipeline

        Args:
            text: Input text to summarize

        Returns:
            str: Formatted summary or None if summarization fails
        """
        if not text or len(text.strip()) < 100:
            st.error("β Text is too short to summarize effectively (minimum 100 characters required)")
            return None

        # Check text length limits
        word_count = len(text.split())
        if word_count > 10000:
            st.warning(f"β οΈ Large text detected ({word_count:,} words). Processing may take several minutes.")

        try:
            # Load model if not already loaded
            if not self.summarizer:
                with st.spinner("π€ Loading AI model..."):
                    if not self.load_model():
                        return None

            # Chunk the text
            chunks = self.chunk_text(text)

            if len(chunks) == 0:
                st.error("β Could not process the text into chunks")
                return None

            st.info(f"π Processing {len(chunks)} text chunk(s)...")

            # Summarize each chunk, tracking progress and failures.
            summaries = []
            progress_bar = st.progress(0)
            failed_chunks = 0

            for i, chunk in enumerate(chunks):
                try:
                    with st.spinner(f"π Summarizing part {i+1} of {len(chunks)}..."):
                        chunk_summary = self.summarize_chunk(chunk)
                        if chunk_summary:
                            summaries.append(chunk_summary)
                        else:
                            failed_chunks += 1
                except Exception as e:
                    st.warning(f"β οΈ Failed to summarize chunk {i+1}: {str(e)}")
                    failed_chunks += 1
                    continue  # NOTE: skips the progress update below

                progress_bar.progress((i + 1) / len(chunks))

            # Check if we have any successful summaries
            if not summaries:
                st.error("β Could not generate any summaries from the text")
                return None

            if failed_chunks > 0:
                st.warning(f"β οΈ {failed_chunks} out of {len(chunks)} chunks failed to process")

            # Combine summaries
            combined_summary = ' '.join(summaries)

            # If we have multiple chunks, summarize the combined summary
            # (second pass keeps the final output concise).
            if len(chunks) > 1 and len(combined_summary.split()) > 200:
                try:
                    with st.spinner("π Creating final summary..."):
                        final_summary = self.summarize_chunk(combined_summary)
                        if final_summary:
                            combined_summary = final_summary
                except Exception as e:
                    st.warning(f"β οΈ Could not create final summary, using combined chunks: {str(e)}")

            # Format as bullet points
            formatted_summary = self.format_as_bullets(combined_summary)

            if not formatted_summary.strip():
                st.error("β Generated summary is empty")
                return None

            return formatted_summary

        except MemoryError:
            st.error("β Out of memory. Please try with a shorter text or restart the application.")
            return None
        except Exception as e:
            st.error(f"β Unexpected error during summarization: {str(e)}")
            return None
|
src/modules/utils.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for the AI Notes Summarizer application
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from typing import Optional
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
def setup_logging():
    """Configure root logging to write both to 'app.log' and the console.

    Returns:
        logging.Logger: Logger named after this module.
    """
    log_targets = [
        logging.FileHandler('app.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_targets,
    )
    return logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
def validate_input(text: str, min_length: int = 50) -> bool:
    """Report whether *text* is non-empty and long enough to process.

    Args:
        text: Input text to validate
        min_length: Minimum required length in characters

    Returns:
        bool: True if valid, False otherwise (an error is shown via st)
    """
    stripped = text.strip() if text else ""

    if not stripped:
        st.error("Please provide some text content")
        return False

    if len(stripped) < min_length:
        st.error(f"Text is too short. Please provide at least {min_length} characters.")
        return False

    return True
|
| 42 |
+
|
| 43 |
+
def clean_text(text: str) -> str:
    """Collapse whitespace and strip unexpected symbols from *text*.

    Args:
        text: Raw text content

    Returns:
        str: Normalised text (empty string for falsy input)
    """
    if not text:
        return ""

    # First collapse all whitespace runs into single spaces.
    normalised = re.sub(r'\s+', ' ', text)
    # Then whitelist word characters and common punctuation; anything
    # else is replaced by a space so tokens stay separated.
    normalised = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/]', ' ', normalised)
    # split/join removes any doubled spaces the substitution introduced.
    return ' '.join(normalised.split()).strip()
|
| 66 |
+
|
| 67 |
+
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as B, KB, or MB.

    Args:
        size_bytes: Size in bytes

    Returns:
        str: Human readable size string with one decimal for KB/MB
    """
    kb = 1024
    mb = kb * kb
    if size_bytes >= mb:
        return f"{size_bytes / mb:.1f} MB"
    if size_bytes >= kb:
        return f"{size_bytes / kb:.1f} KB"
    return f"{size_bytes} B"
|
| 83 |
+
|
| 84 |
+
def display_summary_stats(original_text: str, summary: str):
    """Show word counts and compression ratio as three Streamlit metrics.

    Args:
        original_text: Original input text
        summary: Generated summary
    """
    original_words = len(original_text.split())
    summary_words = len(summary.split())

    # Guard against division by zero on empty input.
    if original_words > 0:
        compression_ratio = (1 - summary_words / original_words) * 100
    else:
        compression_ratio = 0

    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric("Original Words", f"{original_words:,}")

    with col2:
        st.metric("Summary Words", f"{summary_words:,}")

    with col3:
        st.metric("Compression", f"{compression_ratio:.1f}%")
|
| 106 |
+
|
| 107 |
+
def create_download_link(content: str, filename: str = "summary.txt") -> str:
    """Build an HTML anchor that downloads *content* as a text file.

    Args:
        content: Content to download
        filename: Name suggested to the browser for the downloaded file

    Returns:
        str: HTML download link using a base64 data URI
    """
    import base64

    b64 = base64.b64encode(content.encode()).decode()
    # BUG FIX: the download attribute previously contained a hard-coded
    # placeholder string, so the `filename` parameter was ignored and
    # every download got a meaningless name.
    href = f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download Summary</a>'
    return href
|
src/setup.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Setup script for AI Notes Summarizer
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from setuptools import setup, find_packages
|
| 6 |
+
|
| 7 |
+
with open("README.md", "r", encoding="utf-8") as fh:
|
| 8 |
+
long_description = fh.read()
|
| 9 |
+
|
| 10 |
+
with open("requirements.txt", "r", encoding="utf-8") as fh:
|
| 11 |
+
requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]
|
| 12 |
+
|
| 13 |
+
setup(
|
| 14 |
+
name="ai-notes-summarizer",
|
| 15 |
+
version="1.0.0",
|
| 16 |
+
author="AI Notes Summarizer",
|
| 17 |
+
description="A web application for AI-powered document summarization",
|
| 18 |
+
long_description=long_description,
|
| 19 |
+
long_description_content_type="text/markdown",
|
| 20 |
+
packages=find_packages(),
|
| 21 |
+
classifiers=[
|
| 22 |
+
"Development Status :: 4 - Beta",
|
| 23 |
+
"Intended Audience :: Education",
|
| 24 |
+
"Intended Audience :: End Users/Desktop",
|
| 25 |
+
"License :: OSI Approved :: MIT License",
|
| 26 |
+
"Operating System :: OS Independent",
|
| 27 |
+
"Programming Language :: Python :: 3",
|
| 28 |
+
"Programming Language :: Python :: 3.8",
|
| 29 |
+
"Programming Language :: Python :: 3.9",
|
| 30 |
+
"Programming Language :: Python :: 3.10",
|
| 31 |
+
"Programming Language :: Python :: 3.11",
|
| 32 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 33 |
+
"Topic :: Text Processing :: Linguistic",
|
| 34 |
+
],
|
| 35 |
+
python_requires=">=3.8",
|
| 36 |
+
install_requires=requirements,
|
| 37 |
+
entry_points={
|
| 38 |
+
"console_scripts": [
|
| 39 |
+
"ai-notes-summarizer=app:main",
|
| 40 |
+
],
|
| 41 |
+
},
|
| 42 |
+
)
|
src/test_basic.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Basic tests for AI Notes Summarizer modules
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add the current directory to Python path
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
|
| 12 |
+
def test_imports():
    """Test if all modules can be imported"""
    print("Testing module imports...")

    # Each import is attempted independently; the first failure aborts.
    try:
        from modules.pdf_processor import PDFProcessor
    except ImportError as e:
        print(f"β Failed to import PDF Processor: {e}")
        return False
    print("β PDF Processor imported successfully")

    try:
        from modules.text_summarizer import TextSummarizer
    except ImportError as e:
        print(f"β Failed to import Text Summarizer: {e}")
        return False
    print("β Text Summarizer imported successfully")

    try:
        from modules.utils import setup_logging, validate_input
    except ImportError as e:
        print(f"β Failed to import Utils: {e}")
        return False
    print("β Utils imported successfully")

    return True
|
| 38 |
+
|
| 39 |
+
def test_pdf_processor():
    """Test PDF processor basic functionality"""
    print("\nTesting PDF Processor...")

    try:
        from modules.pdf_processor import PDFProcessor

        # Exercise only the pure-text path; no PDF file is required.
        sample = "This is a test\n\nwith multiple spaces\nand newlines."
        cleaned = PDFProcessor().preprocess_text(sample)
        print(f"β Text preprocessing works: '{cleaned}'")

        return True
    except Exception as e:
        print(f"β PDF Processor test failed: {e}")
        return False
|
| 56 |
+
|
| 57 |
+
def test_text_summarizer():
    """Test text summarizer basic functionality"""
    print("\nTesting Text Summarizer...")

    try:
        from modules.text_summarizer import TextSummarizer

        summarizer = TextSummarizer()

        # Chunking runs without downloading the model (tokenizer is None,
        # so the sentence-based fallback is used).
        chunks = summarizer.chunk_text("This is a test sentence. " * 100)
        print(f"β Text chunking works: {len(chunks)} chunks created")

        sample_summary = "This is the first point. This is the second point. This is the third point."
        bullets = summarizer.format_as_bullets(sample_summary)
        print(f"β Bullet formatting works:\n{bullets}")

        return True
    except Exception as e:
        print(f"β Text Summarizer test failed: {e}")
        return False
|
| 79 |
+
|
| 80 |
+
def test_utils():
    """Test utility functions"""
    print("\nTesting Utils...")

    try:
        from modules.utils import validate_input, clean_text, format_file_size

        # Input validation on a string long enough to pass.
        result = validate_input("This is a test text that is long enough to pass validation.")
        print(f"β Input validation works: {result}")

        # Text cleaning strips disallowed symbols.
        cleaned = clean_text("This has multiple spaces and special@#$%characters!")
        print(f"β Text cleaning works: '{cleaned}'")

        # File size formatting: exactly 1 MB.
        size_str = format_file_size(1024 * 1024)
        print(f"β File size formatting works: {size_str}")

        return True
    except Exception as e:
        print(f"β Utils test failed: {e}")
        return False
|
| 104 |
+
|
| 105 |
+
def main():
    """Run all tests"""
    print("π§ͺ Running Basic Tests for AI Notes Summarizer\n")

    # Order matters: import checks run first so later failures are clearer.
    suite = [
        test_imports,
        test_pdf_processor,
        test_text_summarizer,
        test_utils,
    ]

    passed = 0
    for check in suite:
        if check():
            passed += 1
        print()

    total = len(suite)
    print(f"π Test Results: {passed}/{total} tests passed")

    if passed == total:
        print("π All tests passed! The application is ready to run.")
        return True
    print("β οΈ Some tests failed. Please check the errors above.")
    return False
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
success = main()
|
| 135 |
+
sys.exit(0 if success else 1)
|