midlajvalappil committed on
Commit
1ae86a7
·
verified ·
1 Parent(s): fdec505

Upload 10 files

Browse files
src/modules/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""
AI Notes Summarizer Modules
"""

# Package metadata.
__version__ = "1.0.0"
__author__ = "AI Notes Summarizer"
src/modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (260 Bytes). View file
 
src/modules/__pycache__/pdf_processor.cpython-310.pyc ADDED
Binary file (4.38 kB). View file
 
src/modules/__pycache__/text_summarizer.cpython-310.pyc ADDED
Binary file (16.1 kB). View file
 
src/modules/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.29 kB). View file
 
src/modules/pdf_processor.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Processing Module
3
+ Handles PDF file upload, text extraction, and preprocessing.
4
+ """
5
+
6
+ import PyPDF2
7
+ import io
8
+ import re
9
+ from typing import Optional, List
10
+ import streamlit as st
11
+
12
class PDFProcessor:
    """Handle PDF file validation, text extraction, and preprocessing."""

    def __init__(self):
        # Upper bound on accepted upload size. The validate_pdf() error
        # message is derived from this value so the two cannot drift apart.
        self.max_file_size = 10 * 1024 * 1024  # 10MB limit

    def validate_pdf(self, uploaded_file) -> bool:
        """
        Validate uploaded PDF file.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            bool: True if valid, False otherwise
        """
        # Check file size against the configured limit
        if uploaded_file.size > self.max_file_size:
            limit_mb = self.max_file_size / 1024 / 1024
            st.error(f"File size ({uploaded_file.size / 1024 / 1024:.1f}MB) exceeds limit ({limit_mb:.0f}MB)")
            return False

        # Check the MIME type reported by the browser
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False

        return True

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """
        Extract text content from uploaded PDF file.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Extracted text content or None if extraction fails
        """
        try:
            # Reset file pointer in case the stream was already read
            uploaded_file.seek(0)

            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            # Encrypted PDFs cannot be read without a password
            if pdf_reader.is_encrypted:
                st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            # Check number of pages
            num_pages = len(pdf_reader.pages)
            if num_pages == 0:
                st.error("❌ PDF file appears to be empty or corrupted.")
                return None

            if num_pages > 100:
                st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.")

            # Extract text from all pages, remembering which ones fail
            text_content = ""
            failed_pages = []

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    # Guard against a None return and skip blank pages
                    if page_text and page_text.strip():
                        text_content += page_text + "\n"
                except Exception:
                    failed_pages.append(page_num + 1)
                    continue

            # Report failed pages (individually when few, as a count when many)
            if failed_pages:
                if len(failed_pages) < 5:
                    st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
                else:
                    st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages")

            if not text_content.strip():
                st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # Warn when the extracted text is suspiciously short
            if len(text_content.strip()) < 100:
                st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"❌ Invalid or corrupted PDF file: {str(e)}")
            return None
        except MemoryError:
            st.error("❌ PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            st.error(f"❌ Unexpected error processing PDF file: {str(e)}")
            return None

    def preprocess_text(self, text: str) -> str:
        """
        Clean and preprocess extracted text.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text with unwanted characters removed and all
                 whitespace collapsed to single spaces
        """
        if not text:
            return ""

        # Replace special characters that might interfere with processing;
        # any whitespace runs this creates are collapsed by split/join below.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)

        # Collapse all whitespace (spaces, tabs, newlines) to single spaces.
        # This also trims leading/trailing whitespace.
        return ' '.join(text.split())

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """
        Complete PDF processing pipeline: validate, extract, preprocess.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content or None if processing fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        # Extract text
        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        # Preprocess text
        processed_text = self.preprocess_text(raw_text)

        if len(processed_text) < 50:
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")

        return processed_text
src/modules/text_summarizer.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Summarization Module
3
+ Handles text summarization using Hugging Face Transformers.
4
+ """
5
+
6
+ from transformers import pipeline, AutoTokenizer
7
+ import torch
8
+ from typing import List, Optional
9
+ import streamlit as st
10
+ import re
11
+
12
class TextSummarizer:
    """Class to handle text summarization using pre-trained models"""

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """
        Initialize the text summarizer

        Args:
            model_name: Name of the pre-trained model to use
        """
        self.model_name = model_name
        # Pipeline and tokenizer are loaded lazily by load_model().
        self.summarizer = None
        self.tokenizer = None
        self.max_chunk_length = 1024  # Maximum tokens per chunk
        self.min_summary_length = 50
        self.max_summary_length = 300

    # NOTE(review): st.cache_resource on an instance method caches the boolean
    # return value; on a cache hit the assignments to _self.summarizer /
    # _self.tokenizer will not re-run for a fresh instance — verify that the
    # app only ever uses one TextSummarizer instance per session.
    @st.cache_resource
    def load_model(_self):
        """
        Load the summarization model and tokenizer
        """
        try:
            # Check if CUDA is available (-1 selects CPU for the pipeline)
            device = 0 if torch.cuda.is_available() else -1

            # Show device info
            if torch.cuda.is_available():
                st.info(f"πŸš€ Using GPU acceleration: {torch.cuda.get_device_name()}")
            else:
                st.info("πŸ’» Using CPU for processing (this may be slower)")

            # Load the summarization pipeline (fp16 only on GPU)
            _self.summarizer = pipeline(
                "summarization",
                model=_self.model_name,
                device=device,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )

            # Load tokenizer for text chunking
            _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)

            st.success(f"βœ… Model loaded successfully: {_self.model_name}")
            return True

        except OSError as e:
            # OSError covers both download/network failures and bad model paths
            if "Connection error" in str(e) or "timeout" in str(e).lower():
                st.error("❌ Network error: Could not download the model. Please check your internet connection.")
            else:
                st.error(f"❌ Model loading error: {str(e)}")
            return False
        except RuntimeError as e:
            # A CUDA runtime error triggers a one-shot retry on CPU
            if "CUDA" in str(e):
                st.error("❌ GPU memory error. Trying to use CPU instead...")
                try:
                    _self.summarizer = pipeline(
                        "summarization",
                        model=_self.model_name,
                        device=-1,  # Force CPU
                        torch_dtype=torch.float32
                    )
                    _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)
                    st.success("βœ… Model loaded successfully on CPU")
                    return True
                except Exception as cpu_e:
                    st.error(f"❌ Failed to load model on CPU: {str(cpu_e)}")
                    return False
            else:
                st.error(f"❌ Runtime error loading model: {str(e)}")
                return False
        except Exception as e:
            st.error(f"❌ Unexpected error loading model: {str(e)}")
            return False

    def chunk_text(self, text: str) -> List[str]:
        """
        Split long text into smaller chunks for processing

        Args:
            text: Input text to chunk

        Returns:
            List[str]: List of text chunks
        """
        if not self.tokenizer:
            # Fallback chunking by sentences if tokenizer not available
            sentences = re.split(r'[.!?]+', text)
            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len(current_chunk) + len(sentence) < 2000:  # Rough character limit
                    current_chunk += sentence + ". "
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence + ". "

            # Flush the trailing partial chunk
            if current_chunk:
                chunks.append(current_chunk.strip())

            return chunks

        # Use tokenizer for precise chunking: encode once, then slice the
        # token ids into windows of at most max_chunk_length tokens.
        tokens = self.tokenizer.encode(text)
        chunks = []

        for i in range(0, len(tokens), self.max_chunk_length):
            chunk_tokens = tokens[i:i + self.max_chunk_length]
            chunk_text = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)

        return chunks

    def summarize_chunk(self, chunk: str) -> Optional[str]:
        """
        Summarize a single text chunk

        Args:
            chunk: Text chunk to summarize

        Returns:
            str: Summary of the chunk or None if summarization fails
        """
        try:
            # Adjust summary length based on chunk length: target roughly a
            # third of the input, clamped to the configured min/max bounds.
            chunk_length = len(chunk.split())
            max_length = min(self.max_summary_length, max(self.min_summary_length, chunk_length // 3))
            min_length = min(self.min_summary_length, max_length // 2)

            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True
            )

            # Pipeline returns a list of dicts; the text lives under 'summary_text'
            return summary[0]['summary_text']

        except Exception as e:
            st.warning(f"Error summarizing chunk: {str(e)}")
            return None

    def format_as_bullets(self, summary_text: str) -> str:
        """
        Format summary text as bullet points

        Args:
            summary_text: Raw summary text

        Returns:
            str: Formatted bullet points
        """
        # Split by sentences and create bullet points
        sentences = re.split(r'[.!?]+', summary_text)
        bullets = []

        for sentence in sentences:
            sentence = sentence.strip()
            if sentence and len(sentence) > 10:  # Filter out very short fragments
                bullets.append(f"β€’ {sentence}")

        return '\n'.join(bullets)

    def summarize_text(self, text: str) -> Optional[str]:
        """
        Complete text summarization pipeline

        Args:
            text: Input text to summarize

        Returns:
            str: Formatted summary or None if summarization fails
        """
        if not text or len(text.strip()) < 100:
            st.error("❌ Text is too short to summarize effectively (minimum 100 characters required)")
            return None

        # Check text length limits
        word_count = len(text.split())
        if word_count > 10000:
            st.warning(f"⚠️ Large text detected ({word_count:,} words). Processing may take several minutes.")

        try:
            # Load model if not already loaded
            if not self.summarizer:
                with st.spinner("πŸ€– Loading AI model..."):
                    if not self.load_model():
                        return None

            # Chunk the text
            chunks = self.chunk_text(text)

            if len(chunks) == 0:
                st.error("❌ Could not process the text into chunks")
                return None

            st.info(f"πŸ“„ Processing {len(chunks)} text chunk(s)...")

            # Summarize each chunk, tracking progress and failures
            summaries = []
            progress_bar = st.progress(0)
            failed_chunks = 0

            for i, chunk in enumerate(chunks):
                try:
                    with st.spinner(f"πŸ”„ Summarizing part {i+1} of {len(chunks)}..."):
                        chunk_summary = self.summarize_chunk(chunk)
                        if chunk_summary:
                            summaries.append(chunk_summary)
                        else:
                            failed_chunks += 1
                except Exception as e:
                    st.warning(f"⚠️ Failed to summarize chunk {i+1}: {str(e)}")
                    failed_chunks += 1
                    continue

                progress_bar.progress((i + 1) / len(chunks))

            # Check if we have any successful summaries
            if not summaries:
                st.error("❌ Could not generate any summaries from the text")
                return None

            if failed_chunks > 0:
                st.warning(f"⚠️ {failed_chunks} out of {len(chunks)} chunks failed to process")

            # Combine summaries
            combined_summary = ' '.join(summaries)

            # If we have multiple chunks and the combined summary is still
            # long, run one more summarization pass over the combination.
            if len(chunks) > 1 and len(combined_summary.split()) > 200:
                try:
                    with st.spinner("πŸ”„ Creating final summary..."):
                        final_summary = self.summarize_chunk(combined_summary)
                        if final_summary:
                            combined_summary = final_summary
                except Exception as e:
                    st.warning(f"⚠️ Could not create final summary, using combined chunks: {str(e)}")

            # Format as bullet points
            formatted_summary = self.format_as_bullets(combined_summary)

            if not formatted_summary.strip():
                st.error("❌ Generated summary is empty")
                return None

            return formatted_summary

        except MemoryError:
            st.error("❌ Out of memory. Please try with a shorter text or restart the application.")
            return None
        except Exception as e:
            st.error(f"❌ Unexpected error during summarization: {str(e)}")
            return None
src/modules/utils.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for the AI Notes Summarizer application
3
+ """
4
+
5
+ import logging
6
+ import streamlit as st
7
+ from typing import Optional
8
+ import re
9
+
10
def setup_logging():
    """Configure root logging to write to both app.log and the console.

    Returns:
        logging.Logger: A logger named after this module.
    """
    destinations = [
        logging.FileHandler('app.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=destinations,
    )
    return logging.getLogger(__name__)
21
+
22
def validate_input(text: str, min_length: int = 50) -> bool:
    """Check that the given text is non-empty and long enough.

    Args:
        text: Input text to validate
        min_length: Minimum required length in characters

    Returns:
        bool: True if valid, False otherwise (an error is shown in the UI)
    """
    stripped = text.strip() if text else ""

    if not stripped:
        st.error("Please provide some text content")
        return False

    if len(stripped) < min_length:
        st.error(f"Text is too short. Please provide at least {min_length} characters.")
        return False

    return True
42
+
43
def clean_text(text: str) -> str:
    """Normalize raw text: drop odd characters and collapse whitespace.

    Args:
        text: Raw text content

    Returns:
        str: Cleaned text (empty string for falsy input)
    """
    if not text:
        return ""

    # Collapse whitespace runs, then blank out anything that is not a word
    # character, whitespace, or common punctuation.
    collapsed = re.sub(r'\s+', ' ', text)
    kept = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/]', ' ', collapsed)

    # Squeeze the spaces introduced by the substitution above.
    return ' '.join(kept.split()).strip()
66
+
67
def format_file_size(size_bytes: int) -> str:
    """
    Format file size in human readable format.

    Args:
        size_bytes: Size in bytes

    Returns:
        str: Formatted size string, e.g. "512 B", "1.5 KB", "2.0 MB", "1.2 GB"
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    elif size_bytes < 1024 * 1024:
        return f"{size_bytes / 1024:.1f} KB"
    elif size_bytes < 1024 * 1024 * 1024:
        return f"{size_bytes / (1024 * 1024):.1f} MB"
    else:
        # Previously anything >= 1 GB was rendered as a large MB figure
        # (e.g. "1024.0 MB"); show a GB tier instead.
        return f"{size_bytes / (1024 * 1024 * 1024):.1f} GB"
83
+
84
def display_summary_stats(original_text: str, summary: str):
    """Render word counts and compression ratio for a summarization run.

    Args:
        original_text: Original input text
        summary: Generated summary
    """
    original_words = len(original_text.split())
    summary_words = len(summary.split())
    if original_words > 0:
        compression_ratio = (1 - summary_words / original_words) * 100
    else:
        compression_ratio = 0

    # One metric per column, in a fixed left-to-right order.
    metrics = (
        ("Original Words", f"{original_words:,}"),
        ("Summary Words", f"{summary_words:,}"),
        ("Compression", f"{compression_ratio:.1f}%"),
    )
    for column, (label, value) in zip(st.columns(3), metrics):
        with column:
            st.metric(label, value)
106
+
107
def create_download_link(content: str, filename: str = "summary.txt") -> str:
    """
    Create an HTML download link for the summary.

    Args:
        content: Content to download
        filename: Name of the file suggested to the browser

    Returns:
        str: An <a> tag whose href is a base64 data URI of the content
    """
    import base64

    b64 = base64.b64encode(content.encode()).decode()
    # Bug fix: the download attribute previously hardcoded "(unknown)" and
    # ignored the `filename` parameter entirely.
    href = f'<a href="data:text/plain;base64,{b64}" download="{filename}">Download Summary</a>'
    return href
src/setup.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Setup script for AI Notes Summarizer
3
+ """
4
+
5
+ from setuptools import setup, find_packages
6
+
7
# The PyPI long description is taken verbatim from the project README.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies mirror requirements.txt, skipping blanks and comments.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

setup(
    name="ai-notes-summarizer",
    version="1.0.0",
    author="AI Notes Summarizer",
    description="A web application for AI-powered document summarization",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Education",
        "Intended Audience :: End Users/Desktop",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Text Processing :: Linguistic",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    # Installs a console command that calls app:main;
    # NOTE(review): assumes app.py exposes main() at import path `app` — verify.
    entry_points={
        "console_scripts": [
            "ai-notes-summarizer=app:main",
        ],
    },
)
src/test_basic.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Basic tests for AI Notes Summarizer modules
4
+ """
5
+
6
+ import sys
7
+ import os
8
+
9
# Make the sibling `modules` package importable when running this file directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
11
+
12
def test_imports():
    """Verify that every application module can be imported."""
    print("Testing module imports...")

    try:
        from modules.pdf_processor import PDFProcessor
    except ImportError as err:
        print(f"❌ Failed to import PDF Processor: {err}")
        return False
    print("βœ… PDF Processor imported successfully")

    try:
        from modules.text_summarizer import TextSummarizer
    except ImportError as err:
        print(f"❌ Failed to import Text Summarizer: {err}")
        return False
    print("βœ… Text Summarizer imported successfully")

    try:
        from modules.utils import setup_logging, validate_input
    except ImportError as err:
        print(f"❌ Failed to import Utils: {err}")
        return False
    print("βœ… Utils imported successfully")

    return True
38
+
39
def test_pdf_processor():
    """Smoke-test the PDF processor's text preprocessing."""
    print("\nTesting PDF Processor...")

    try:
        from modules.pdf_processor import PDFProcessor

        # Exercise preprocessing on a string with newlines and extra spaces.
        sample = "This is a test\n\nwith multiple spaces\nand newlines."
        cleaned = PDFProcessor().preprocess_text(sample)
        print(f"βœ… Text preprocessing works: '{cleaned}'")
        return True
    except Exception as err:
        print(f"❌ PDF Processor test failed: {err}")
        return False
56
+
57
def test_text_summarizer():
    """Smoke-test chunking and bullet formatting (no model download)."""
    print("\nTesting Text Summarizer...")

    try:
        from modules.text_summarizer import TextSummarizer

        summarizer = TextSummarizer()

        # Chunking should work without the model being loaded.
        pieces = summarizer.chunk_text("This is a test sentence. " * 100)
        print(f"βœ… Text chunking works: {len(pieces)} chunks created")

        # Bullet formatting is pure string manipulation.
        sample = "This is the first point. This is the second point. This is the third point."
        rendered = summarizer.format_as_bullets(sample)
        print(f"βœ… Bullet formatting works:\n{rendered}")

        return True
    except Exception as err:
        print(f"❌ Text Summarizer test failed: {err}")
        return False
79
+
80
def test_utils():
    """Smoke-test the utility helpers."""
    print("\nTesting Utils...")

    try:
        from modules.utils import validate_input, clean_text, format_file_size

        # Input validation on a comfortably long string.
        ok = validate_input("This is a test text that is long enough to pass validation.")
        print(f"βœ… Input validation works: {ok}")

        # Text cleaning should strip the special characters.
        cleaned = clean_text("This has multiple spaces and special@#$%characters!")
        print(f"βœ… Text cleaning works: '{cleaned}'")

        # File size formatting for exactly one megabyte.
        rendered = format_file_size(1024 * 1024)
        print(f"βœ… File size formatting works: {rendered}")

        return True
    except Exception as err:
        print(f"❌ Utils test failed: {err}")
        return False
104
+
105
def main():
    """Run every test and report an overall pass/fail summary."""
    print("πŸ§ͺ Running Basic Tests for AI Notes Summarizer\n")

    suite = (
        test_imports,
        test_pdf_processor,
        test_text_summarizer,
        test_utils,
    )

    passed = 0
    for check in suite:
        if check():
            passed += 1
        print()

    total = len(suite)
    print(f"πŸ“Š Test Results: {passed}/{total} tests passed")

    if passed == total:
        print("πŸŽ‰ All tests passed! The application is ready to run.")
        return True
    print("⚠️ Some tests failed. Please check the errors above.")
    return False
132
+
133
# Run the suite and mirror its result in the process exit code.
if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)