Alphin Jain committed on
Commit
334c1a6
·
0 Parent(s):

First commit

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Karan Verma
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Expedition-Aya-Insight
2
+ AI-Powered Multilingual Scientific Summarization with Cohere
3
+ Aya-Insight is a fast, multilingual AI tool that extracts structured, reasoning-driven insights from scientific research papers.
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
"""Streamlit front end for the Aya Insight document summarizer.

Flow: ask for a Cohere API key, accept PDF/image uploads, run the
extraction -> retrieval -> summarization pipeline imported from ``main``,
then show one selectable tile per document together with its summary.
"""
import html
import logging
import os
import sys

import streamlit as st

logger = logging.getLogger(__name__)

# --- Streamlit App Configuration ---
# set_page_config is documented as the first Streamlit command of a script, so
# it runs before the import guard below (whose failure path calls st.error).
st.set_page_config(
    page_title="Aya Insight Document Summarizer",
    page_icon="📄",
    layout="wide"
)

# --- Pipeline import guard ---
# NOTE(review): assumes the directory holding main.py is already importable;
# nothing in this script modifies sys.path.
try:
    from main import process_uploaded_files, setup_retrieval_system, summarize_extracted_documents
    # Configure logging only after the pipeline imported successfully.
    logging.basicConfig(level='INFO', format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger.info("Streamlit app started and logging configured.")
    modules_loaded = True
except ImportError as e:
    st.error("Could not import application modules. Please ensure your project structure is correct and dependencies are installed.")
    st.error(f"ImportError: {e}")
    logger.error(f"Failed to import application modules: {e}", exc_info=True)
    # The main UI stays hidden while this flag is False.
    modules_loaded = False


# --- Session State Initialization ---
# summary_results: None = nothing generated yet, [] = a run produced nothing.
_SESSION_DEFAULTS = {
    'api_key_entered': False,
    'summary_results': None,
    'selected_filename': None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# Inline CSS for the (purely visual) document tiles.
_TILE_STYLE = "border: 2px solid lightgrey; padding: 10px; margin: 5px; text-align: center; cursor: pointer;"
_TILE_STYLE_SELECTED = "border: 2px solid steelblue; padding: 10px; margin: 5px; text-align: center; cursor: pointer; background-color: #e6f3ff;"
_FILES_PER_ROW = 3


# --- API Key Input Section ---
if not st.session_state.api_key_entered:
    st.title("🔒 Enter Your Cohere API Key to Unlock")
    api_key = st.text_input("Cohere API Key", type="password", help="Enter your Cohere API key to use the summarization service.")

    if st.button("Unlock"):
        # Only emptiness is checked; the key is not validated against the API.
        # Surrounding whitespace from copy/paste is dropped.
        api_key = (api_key or "").strip()
        if api_key:
            # The Cohere clients are created without an explicit key, so the
            # key is handed over through the environment.
            os.environ["COHERE_API_KEY"] = api_key
            st.session_state.api_key_entered = True
            st.success("API Key accepted. You can now upload documents.")
            st.rerun()  # Rerun the app to show the main content
        else:
            st.warning("Please enter your Cohere API key.")

# --- Main Application Content (Unlocked) ---
if st.session_state.api_key_entered and modules_loaded:
    st.title("📄 Aya Insight Document Summarizer")
    st.markdown("""
Upload one or more PDF or image files to get a structured summary for each document.
""")

    # --- File Uploader ---
    uploaded_files = st.file_uploader(
        "Choose Document Files",
        type=["pdf", "png", "jpg", "jpeg", "tiff", "bmp", "gif"],
        accept_multiple_files=True,
        help="You can upload multiple PDF or image documents here."
    )

    # --- Summarize Button and Logic ---
    if uploaded_files:  # The button is only offered once files are present
        st.info(f"You have uploaded {len(uploaded_files)} file(s).")

        if st.button("Generate Summaries", key="summarize_button"):
            # A new run invalidates the previous selection.
            st.session_state.selected_filename = None
            st.subheader("Processing Documents...")

            with st.spinner("Processing documents and generating summaries... This may take a few minutes depending on file size and number."):
                try:
                    # Step 1: Process uploaded files (Extraction)
                    logger.info(f"Calling process_uploaded_files with {len(uploaded_files)} files.")
                    extraction_results = process_uploaded_files(uploaded_files)
                    logger.info(f"Finished document extraction. {len(extraction_results)} results obtained.")

                    if not any(res.get('text') for res in extraction_results):
                        # Nothing to index or summarize: stop this run here
                        # but keep the page alive so the user can try again.
                        st.error("No text could be extracted from the uploaded files. Please check the file formats.")
                        logger.error("No text extracted from any uploaded file.")
                        st.session_state.summary_results = []
                    else:
                        # Step 2: Setup retrieval system (Vector Store and Embedding)
                        logger.info("Calling setup_retrieval_system.")
                        extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
                        logger.info("Retriever system setup complete.")

                        # Step 3: Summarize the extracted documents
                        logger.info("Calling summarize_extracted_documents.")
                        summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
                        logger.info(f"Finished summarization. {len(summary_results)} summary results obtained.")

                        st.session_state.summary_results = summary_results

                except FileNotFoundError as fnf_error:
                    st.error(f"Configuration Error: {fnf_error}. Please check your environment settings.")
                    logger.error(f"Configuration Error during Streamlit process: {fnf_error}", exc_info=True)
                    st.session_state.summary_results = []
                except Exception as e:
                    # Top-level UI boundary: report, log with traceback, keep running.
                    st.error(f"An unexpected error occurred during processing: {e}")
                    logger.error(f"An unexpected error occurred during Streamlit process: {e}", exc_info=True)
                    st.session_state.summary_results = []

    # --- Display Document Tiles and Summaries ---
    if st.session_state.summary_results is not None:
        st.subheader("Summaries:")
        results = st.session_state.summary_results

        if not results:
            st.info("No summaries were generated. Upload files and click 'Generate Summaries'.")
        else:
            # One display name per result; results without a filename get a
            # positional placeholder that is also used for selection lookup.
            display_names = [res.get('filename', f'File {idx+1}') for idx, res in enumerate(results)]

            # Lay the documents out as a grid, _FILES_PER_ROW tiles per row.
            for row_start in range(0, len(results), _FILES_PER_ROW):
                cols = st.columns(_FILES_PER_ROW)
                row_names = display_names[row_start:row_start + _FILES_PER_ROW]
                for offset, filename in enumerate(row_names):
                    file_index = row_start + offset
                    is_selected = st.session_state.selected_filename == filename
                    tile_style = _TILE_STYLE_SELECTED if is_selected else _TILE_STYLE

                    with cols[offset]:
                        # The tile is decorative. The filename comes from the
                        # uploaded file, so it is escaped before being placed
                        # into raw HTML.
                        st.markdown(
                            f"""
<div style="{tile_style}">
📄<br>
<strong>{html.escape(str(filename))}</strong>
</div>
""",
                            unsafe_allow_html=True
                        )
                        # Selection is handled by a regular Streamlit button.
                        if st.button(f"📄 {filename}", key=f"tile_button_{file_index}"):
                            st.session_state.selected_filename = filename
                            logger.info(f"Selected file: {filename}")
                            st.rerun()  # Rerun to display the summary

            # Display summary of the selected file
            selected_name = st.session_state.selected_filename
            if selected_name:
                st.markdown("---")
                st.subheader(f"Summary for: {selected_name}")

                selected_result = next(
                    (res for res, name in zip(results, display_names) if name == selected_name),
                    None,
                )

                if selected_result is None:
                    st.info(f"Summary not available for {selected_name}.")
                elif not selected_result.get('success'):
                    # Failures are reported even when no summary text exists.
                    st.error(f"Could not load summary for {selected_name}: {selected_result.get('error', 'Unknown error')}")
                elif selected_result.get('summary'):
                    st.markdown(selected_result.get('summary'))
                else:
                    st.info(f"Summary not available for {selected_name}.")

            # Display overall processing status
            successful_count = sum(bool(res.get('success', False)) for res in results)
            total_files = len(results)
            st.markdown("---")
            st.success(f"Processed {total_files} files. Successfully summarized {successful_count}.")
            if successful_count < total_files:
                st.warning("Some files could not be processed or summarized. See error messages above.")


# --- Message while the API key has not been entered ---
if not st.session_state.api_key_entered:
    st.info("Enter your Cohere API Key above to unlock the application functionality.")
215
+
app/__init__.py ADDED
File without changes
app/config/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Configuration package.
3
+ """
app/config/settings.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Application settings, read from environment variables with defaults."""
import logging
import os

from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
# before any of the lookups below run.
load_dotenv()

# Raise the httpx logger threshold so only WARNING and above are emitted.
logging.getLogger("httpx").setLevel(logging.WARNING)

# --- Input / output locations ---
DOCS_FOLDER = os.getenv("DOCS_FOLDER", "samples/pdf5")
SUMMARIES_OUTPUT_DIR = os.getenv("SUMMARIES_OUTPUT_DIR", "summaries")

# The summaries directory is created at import time.
os.makedirs(SUMMARIES_OUTPUT_DIR, exist_ok=True)

# --- Model identifiers (embedding, reranking, chat) ---
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "embed-v4.0")
COHERERANK_MODEL = os.getenv('COHERERANK_MODEL', 'rerank-v3.5')
LLM_MODEL = os.getenv("LLM_MODEL", "command-a-03-2025")

# --- Text splitting and retrieval sizes ---
# Each value is parsed with int(); a non-numeric override raises ValueError
# when this module is imported.
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "100"))
COHERERANK_TOPN = int(os.getenv("COHERERANK_TOPN", "100"))
VECTOSTORE_TOPK = int(os.getenv("VECTOSTORE_TOPK", "100"))
app/document_processing/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Document processing package.
3
+ """
app/document_processing/extractors.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document extraction functionality for processing documents.
3
+ """
4
+ import json
5
+ import os
6
+ import concurrent.futures
7
+ import time
8
+
9
+ import cohere
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import List, Dict, Any, Optional
13
+ from langchain.docstore.document import Document
14
+ from ..config.settings import CHUNK_SIZE, LLM_MODEL
15
+
16
+ # Configure logging with a null handler by default
17
+ logger = logging.getLogger(__name__)
18
+ logger.addHandler(logging.NullHandler())
19
+
20
+
21
class DocumentProcessor:
    """Common interface for per-format text extractors.

    A subclass fills ``supported_extensions`` with lower-case suffixes
    (including the leading dot) and overrides :meth:`process`.
    """

    def __init__(self):
        # Suffixes this processor accepts; empty in the base class.
        self.supported_extensions = []

    def can_process(self, file_path: str) -> bool:
        """Return True when the file's suffix (case-insensitive) is supported."""
        suffix = Path(file_path).suffix.lower()
        return suffix in self.supported_extensions

    def process(self, file_path: str, **kwargs) -> str:
        """Extract text from ``file_path``; the base class always raises.

        Raises:
            NotImplementedError: Always, until overridden by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method")
35
+
36
+
37
class PdfProcessor(DocumentProcessor):
    """Processor that extracts text from PDF documents with ``pypdf``."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.pdf']

    def process(self, file_path: str, **kwargs) -> str:
        """Return the text of every page, joined by newlines and stripped.

        Args:
            file_path: Path to the PDF file.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The concatenated page text with leading/trailing whitespace removed.

        Raises:
            Exception: Whatever ``pypdf`` raises is logged and re-raised.
        """
        try:
            # Imported lazily so pypdf is only required when PDFs are processed.
            from pypdf import PdfReader

            logger.debug(f"Processing PDF: {file_path}")
            reader = PdfReader(file_path)
            # `or ""` keeps a page without extractable text from breaking the
            # join; joining once avoids repeated string concatenation.
            page_texts = [(page.extract_text() or "") for page in reader.pages]
            return "\n".join(page_texts).strip()
        except Exception as e:
            logger.error(f"Error processing PDF {file_path}: {e}")
            raise
59
+
60
+
61
class ImageProcessor(DocumentProcessor):
    """Processor that runs Tesseract OCR (via ``pytesseract``) on image files."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']
        # Languages tried by default, joined with '+' as Tesseract expects.
        # Simplified Chinese is spelled "chi_sim" (underscore) in Tesseract's
        # language data; "chi-sim" is not a valid language code.
        self.default_languages = "eng+fra+hin+spa+chi_sim"

    def process(self, file_path: str, **kwargs) -> str:
        """Return the OCR text of an image, stripped of surrounding whitespace.

        Args:
            file_path: Path to the image file.
            **kwargs: ``lang`` overrides the default Tesseract language string.

        Returns:
            The recognized text.

        Raises:
            Exception: Any PIL / pytesseract error is logged and re-raised.
        """
        try:
            # Imported lazily so OCR dependencies are only needed for images.
            import pytesseract
            from PIL import Image

            lang = kwargs.get('lang', self.default_languages)
            logger.debug(f"Processing image: {file_path} with languages: {lang}")
            # The context manager releases the file handle once OCR is done.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang=lang)
            return text.strip()
        except Exception as e:
            logger.error(f"Error processing image {file_path}: {e}")
            raise
86
+
87
+
88
class DocumentExtractor:
    """Main entry point for extracting text from files and folders.

    Holds a list of :class:`DocumentProcessor` instances and dispatches each
    file to the first processor whose ``can_process`` accepts it.
    """

    def __init__(self):
        """Initialize with the default PDF and image processors."""
        self.processors = [
            PdfProcessor(),
            ImageProcessor()
        ]
        # Created lazily on the first language-detection call.
        self.cohere_client = None

    def add_processor(self, processor: DocumentProcessor) -> None:
        """Register an additional processor; it is consulted after existing ones."""
        self.processors.append(processor)

    def get_processor(self, file_path: str) -> Optional[DocumentProcessor]:
        """Return the first processor that accepts ``file_path``, or None."""
        for processor in self.processors:
            if processor.can_process(file_path):
                return processor
        return None

    def get_language(self, text: str) -> str:
        """
        Detect the language of the provided text using the Cohere chat API.

        Args:
            text: Text sample to analyze

        Returns:
            The model's answer (expected to be a language name), stripped of
            surrounding whitespace, or "unknown" if the request fails.
        """
        try:
            if not self.cohere_client:
                # No key is passed explicitly.
                # NOTE(review): presumably the client reads its key from the
                # environment — confirm against the Cohere SDK.
                self.cohere_client = cohere.Client()

            prompt = f"What language is this sentence written in?\n\n{text}\n\nRespond only with the language name."
            response = self.cohere_client.chat(
                model=LLM_MODEL,
                message=prompt,
                max_tokens=100,
                temperature=0.2,
            )
            answer = response.text
            # Models frequently append a trailing newline; normalize it away.
            return answer.strip() if isinstance(answer, str) else answer

        except Exception as e:
            # Language detection is best-effort; extraction must not fail on it.
            logger.error(f"Error detecting language: {e}")
            return "unknown"

    def process_file(self, file_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process a single file based on its extension.

        Args:
            file_path: Path to the file
            **kwargs: Additional processing options, forwarded to the processor

        Returns:
            Dictionary with keys ``file_path``, ``filename``, ``text``,
            ``error``, ``type``, ``language`` and ``chunk_size``. Failures are
            reported through ``error`` rather than raised.
        """
        result = {
            "file_path": file_path,
            "filename": Path(file_path).name,
            "text": "",
            "error": None,
            "type": None,
            "language": None,
            "chunk_size": 0
        }

        try:
            processor = self.get_processor(file_path)

            if processor:
                text = processor.process(file_path, **kwargs)
                result["text"] = text
                # Only the first CHUNK_SIZE characters are sent for detection.
                result["language"] = self.get_language(text[:CHUNK_SIZE]) if text else None
                # e.g. PdfProcessor -> "pdf", ImageProcessor -> "image"
                result["type"] = processor.__class__.__name__.lower().replace('processor', '')
            else:
                ext = Path(file_path).suffix.lower()
                result["error"] = f"Unsupported file type: {ext}"
        except Exception as e:
            result["error"] = str(e)

        return result

    def process_files(self, file_paths: List[str], **kwargs) -> List[Dict[str, Any]]:
        """
        Process multiple files in parallel worker processes.

        Args:
            file_paths: List of file paths to process
            **kwargs: Additional processing options
                (max_workers: base worker count, defaults to the CPU count;
                the pool is created with twice this number)

        Returns:
            List of result dictionaries (same keys as ``process_file``), in
            completion order rather than input order.
        """
        max_workers = kwargs.pop('max_workers', os.cpu_count() or 1)
        pool_size = max_workers * 2
        logger.info(f"Processing {len(file_paths)} files with {pool_size} workers")

        results = []
        if not file_paths:
            # Nothing to do; avoid creating a process pool for no work.
            return results

        with concurrent.futures.ProcessPoolExecutor(max_workers=pool_size) as executor:
            futures = {
                executor.submit(self.process_file, file_path, **kwargs): file_path
                for file_path in file_paths
            }

            for future in concurrent.futures.as_completed(futures):
                file_path = futures[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Exception processing {file_path}: {e}")
                    # Same keys as a process_file result so downstream code
                    # can treat successes and failures uniformly.
                    results.append({
                        "file_path": file_path,
                        "filename": Path(file_path).name,
                        "text": "",
                        "error": str(e),
                        "type": None,
                        "language": None,
                        "chunk_size": 0
                    })

        return results

    def find_supported_files(self, folder_path: str, recursive: bool = True) -> List[str]:
        """
        Get all supported files in a folder.

        Args:
            folder_path: Path to the folder
            recursive: Whether to include subfolders

        Returns:
            List of file paths whose suffix is handled by a registered processor
        """
        # Union of the suffixes of every registered processor.
        supported_extensions = set()
        for processor in self.processors:
            supported_extensions.update(processor.supported_extensions)

        file_paths = []

        if recursive:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    if Path(file).suffix.lower() in supported_extensions:
                        file_paths.append(os.path.join(root, file))
        else:
            for file in os.listdir(folder_path):
                file_path = os.path.join(folder_path, file)
                if os.path.isfile(file_path) and Path(file).suffix.lower() in supported_extensions:
                    file_paths.append(file_path)

        return file_paths

    def process_folder(self, folder_path: str, recursive: bool = True, **kwargs) -> List[Dict[str, Any]]:
        """
        Process all supported files in a folder.

        Args:
            folder_path: Path to the folder containing documents
            recursive: Whether to process subfolders recursively
            **kwargs: Additional processing options, forwarded to process_files

        Returns:
            List of dictionaries with processing results
        """
        file_paths = self.find_supported_files(folder_path, recursive)
        logger.info(f"Found {len(file_paths)} supported files in {folder_path}")

        return self.process_files(file_paths, **kwargs)
265
+
266
+
267
class FileOutputManager:
    """Writes extracted text to ``<stem>_<type>.txt`` files in one directory."""

    def __init__(self, output_dir: str = "extracted_texts"):
        """Remember the output directory and create it if necessary."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def save_results(self, results: List[Dict[str, Any]]) -> Dict[str, int]:
        """
        Save extracted text to files.

        Args:
            results: List of processing results; each is expected to provide
                ``filename`` and ``text``, and optionally ``type`` and
                ``file_path``.

        Returns:
            Dictionary with the number of results written (``success``),
            skipped because they had no text (``skipped``) and failed
            (``failed``).
        """
        stats = {"success": 0, "skipped": 0, "failed": 0}

        for result in results:
            text = result.get("text")
            if not text:
                stats["skipped"] += 1
                continue

            try:
                # Output name: original stem + detected type.
                base_name = Path(result['filename']).stem
                # `type` may be present but None (failed or unsupported file);
                # fall back to "unknown" in that case too.
                file_type = result.get('type') or 'unknown'
                output_filename = f"{base_name}_{file_type}.txt"

                output_path = os.path.join(self.output_dir, output_filename)
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(text)
                stats["success"] += 1
            except Exception as e:
                # Use .get so a malformed result cannot make the handler raise.
                source = result.get('file_path') or result.get('filename') or '<unknown>'
                logger.error(f"Error saving text from {source}: {e}")
                stats["failed"] += 1

        return stats
307
+
308
+
309
+ # Adapter class to convert DocumentExtractor results to langchain Document objects
310
class DocumentProcessorAdapter:
    """
    Thin wrapper that owns a DocumentExtractor and runs it over a folder.
    """

    def __init__(self):
        """Create the underlying extractor."""
        self.extractor = DocumentExtractor()

    def process_folder(self, folder_path):
        """
        Extract text from every supported document in a folder.

        Args:
            folder_path (str): Path to the folder containing documents

        Returns:
            list: The extraction result dictionaries produced by
            ``DocumentExtractor.process_folder``.

        Raises:
            FileNotFoundError: If ``folder_path`` does not exist.
        """
        # Guard clause: fail early when the path is missing.
        if os.path.exists(folder_path):
            results = self.extractor.process_folder(folder_path)
            print(f"Processed {len(results)} documents")
            return results

        raise FileNotFoundError(f"Folder not found: {folder_path}")
app/retrieval/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Retrieval package.
3
+ """
app/retrieval/vector_store.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector database operations for document storage and retrieval.
3
+ """
4
+ from langchain_chroma import Chroma
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_cohere import CohereEmbeddings
7
+ from langchain_core.documents import Document
8
+
9
+ from ..config.settings import CHUNK_SIZE, CHUNK_OVERLAP, EMBEDDING_MODEL, COHERERANK_MODEL, COHERERANK_TOPN, VECTOSTORE_TOPK
10
+ import cohere
11
+
12
class Retriever:
    """
    Wrapper around a Chroma vector store with Cohere embeddings and reranking.
    """

    def __init__(self, model=EMBEDDING_MODEL):
        """Create the Cohere client, embedding model and text splitter.

        Args:
            model: Name of the Cohere embedding model.
        """
        self.cohere_client = cohere.Client()
        # Populated by create_from_documents.
        self.chroma_db = None
        self.embedding_model = CohereEmbeddings(model=model)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

    def create_from_documents(self, extraction_results):
        """Split extracted texts into chunks and index them in Chroma.

        Each result with non-empty ``text`` is split; the number of chunks is
        written back to the result under ``chunk_size`` and every chunk carries
        the source ``filename`` as metadata.

        Args:
            extraction_results: List of dicts with ``filename`` and ``text``.

        Returns:
            The same list, with ``chunk_size`` updated in place.

        Raises:
            ValueError: If none of the results yields any chunk to index.
        """
        chunks = []
        for result in extraction_results:
            text = result.get('text')
            if not text:
                continue
            document = Document(
                page_content=text,
                metadata={"filename": result['filename']}
            )
            doc_chunks = self.text_splitter.split_documents([document])
            result['chunk_size'] = len(doc_chunks)
            chunks.extend(doc_chunks)

        if not chunks:
            # Fail with a clear message instead of handing an empty list to
            # the vector store.
            raise ValueError("No text chunks available to build the vector store")

        self.chroma_db = Chroma.from_documents(
            chunks,
            embedding=self.embedding_model
        )
        return extraction_results

    def similarity_search(self, query, k=5, filter=None):
        """Return the ``k`` chunks most similar to ``query``.

        Args:
            query: Query text.
            k: Number of chunks to return.
            filter: Optional metadata filter passed through to Chroma.

        Raises:
            ValueError: If create_from_documents has not been called.
        """
        if not self.chroma_db:
            raise ValueError("Vector store has not been initialized with documents")

        return self.chroma_db.similarity_search(query=query, k=k, filter=filter)

    def reranking(self, query, docs, top_n=10):
        """Rerank ``docs`` against ``query`` with Cohere and return their texts.

        Args:
            query: Query used for reranking.
            docs: Documents exposing ``page_content``.
            top_n: Maximum number of texts to return.

        Returns:
            List of ``page_content`` strings in the order given by the rerank
            response.
        """
        if not docs or top_n <= 0:
            # Nothing to rank; skip the API call.
            return []
        doc_texts = [doc.page_content for doc in docs]
        rerank_response = self.cohere_client.rerank(model=COHERERANK_MODEL, query=query, documents=doc_texts, top_n=top_n)
        return [docs[result.index].page_content for result in rerank_response.results]

    def get_relevant_docs(self, chromdb_query, rerank_query, filter, chunk_size):
        """Dense retrieval followed by reranking.

        Args:
            chromdb_query: Query for the vector similarity search.
            rerank_query: Query for the reranking step.
            filter: Metadata filter for the similarity search.
            chunk_size: Number of chunks of the target document; caps both the
                dense top-k and the rerank top-n.

        Returns:
            List of reranked chunk texts; empty when nothing can be retrieved.
        """
        dense_topk = min(chunk_size, VECTOSTORE_TOPK)
        reranking_topk = min(chunk_size, COHERERANK_TOPN)
        if dense_topk <= 0 or reranking_topk <= 0:
            # A document without chunks has nothing to retrieve.
            return []
        docs = self.similarity_search(chromdb_query, filter=filter, k=dense_topk)
        if docs:
            return self.reranking(rerank_query, docs, top_n=reranking_topk)
        return []
66
+
67
+
68
+
app/summarization/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Summarization package.
3
+ """
app/summarization/output.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Output handling for document summaries in multiple formats.
3
+ """
4
+ import os
5
+ from ..config.settings import SUMMARIES_OUTPUT_DIR
6
+
7
+
8
class SummaryOutputManager:
    """
    Manager for saving document summaries as markdown and HTML files.
    """

    def __init__(self, output_dir=SUMMARIES_OUTPUT_DIR):
        """
        Initialize output manager with output directory.

        Args:
            output_dir (str): Directory to save summaries
        """
        self.output_dir = output_dir
        self._ensure_output_dir()

    def _ensure_output_dir(self):
        """Create the output directory if it doesn't exist."""
        already_there = os.path.isdir(self.output_dir)
        # exist_ok avoids failing when the directory appears between the check
        # above and the creation.
        os.makedirs(self.output_dir, exist_ok=True)
        if not already_there:
            print(f"Created output directory: {self.output_dir}")

    def save_summary(self, filename, summary, formats=None):
        """
        Save a document summary to files in specified formats.

        Args:
            filename (str): Name of the original document
            summary (str): Summary text
            formats (list): List of formats to save. Defaults to ['markdown', 'html']

        Returns:
            dict: Paths to the saved summary files by format; unsupported
            formats are reported with a warning and skipped.
        """
        if formats is None:
            formats = ['markdown', 'html']

        writers = {
            'markdown': self._save_markdown,
            'html': self._save_html,
        }
        output_paths = {}

        for fmt in formats:
            writer = writers.get(fmt)
            if writer is None:
                print(f"Warning: Unsupported format '{fmt}' requested")
                continue
            output_paths[fmt] = writer(filename, summary)

        return output_paths

    def _save_markdown(self, filename, summary):
        """
        Save a document summary to a markdown file.

        Args:
            filename (str): Name of the original document
            summary (str): Summary text

        Returns:
            str: Path to the saved markdown file
        """
        # Summary followed by a horizontal rule.
        markdown_content = f"{summary}\n\n---\n"

        output_path = os.path.join(self.output_dir, f"{filename}.md")
        # Summaries may be in any language, so the encoding is fixed to UTF-8
        # instead of depending on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

        print(f"Saved markdown summary to: {output_path}")
        return output_path

    def _save_html(self, filename, summary):
        """
        Save a document summary to an HTML file.

        The summary is split on blank lines and each non-empty part becomes one
        ``<p>`` element. Filename and summary text are HTML-escaped because
        they originate from processed documents.

        Args:
            filename (str): Name of the original document
            summary (str): Summary text

        Returns:
            str: Path to the saved HTML file
        """
        from html import escape

        safe_title = escape(str(filename))
        paragraphs = summary.split('\n\n')
        html_paragraphs = ''.join(f"<p>{escape(p)}</p>" for p in paragraphs if p.strip())

        # HTML output with basic styling; doubled braces are literal CSS braces.
        html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Summary for {safe_title}</title>
<style>
body {{
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
max-width: 800px;
margin: 0 auto;
color: #333;
}}
h1 {{
color: #2c3e50;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}}
p {{
margin-bottom: 16px;
}}
.footer {{
margin-top: 30px;
padding-top: 10px;
border-top: 1px solid #eee;
font-size: 0.9em;
color: #7f8c8d;
}}
</style>
</head>
<body>
<h1>Summary for {safe_title}</h1>
<div class="content">
{html_paragraphs}
</div>
<div class="footer">
<p>Generated summary</p>
</div>
</body>
</html>
"""
        output_path = os.path.join(self.output_dir, f"{filename}.html")
        # The document declares charset UTF-8, so it is written as UTF-8.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Saved HTML summary to: {output_path}")
        return output_path

    def get_available_formats(self, filename):
        """
        Check which formats are available for a given file.

        The save methods append the format suffix to the name they are given
        (``<filename>.md``). Both that name and the extension-less variant
        (``<stem>.md``) are checked, so the lookup works whether summaries were
        saved under the full document name or under its stem.

        Args:
            filename (str): Filename (or base filename) to check

        Returns:
            list: Available formats for this file
        """
        base_name = os.path.splitext(filename)[0]
        candidates = (filename, base_name)

        def _exists(suffix):
            return any(
                os.path.exists(os.path.join(self.output_dir, f"{name}{suffix}"))
                for name in candidates
            )

        available_formats = []
        if _exists(".md"):
            available_formats.append('markdown')
        if _exists(".html"):
            available_formats.append('html')

        return available_formats
app/summarization/prompt2.py ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Research Paper Summarization: Key Points Extraction for Researchers
3
+ Note: Always return the summary in the same language as the original paper.
4
+ """
5
+
6
+
7
+ # # Mini-Prompt 1: Basic Information
8
+ # basic_info_prompt = """
9
+ # Extract the essential identifying information from this research paper.
10
+ #
11
+ # | Information | Details |
12
+ # |-------------|---------|
13
+ # | Title | [Paper title] |
14
+ # | Authors | [Author names] |
15
+ # | Publication | [Journal/Conference, Year] |
16
+ # | Field | [Research domain] |
17
+ # | Keywords | [Key terms] |
18
+ # | DOI/URL | [Link if available] |
19
+ # """
20
+ #
21
+ # # Mini-Prompt 2: Research Objectives & Abstract
22
+ # objectives_prompt = """
23
+ # Extract the following:
24
+ #
25
+ # 1. Primary research question/objective (2-3 bullet points)
26
+ # 2. Condensed abstract summary (2-3 sentences)
27
+ # 3. Main contributions (3-5 bullet points)
28
+ # 4. Theoretical foundation/frameworks
29
+ #
30
+ # Keep each point short and focused on a single idea.
31
+ # """
32
+ #
33
+ # # Mini-Prompt 3: Methodology Details
34
+ # methodology_prompt = """
35
+ # Extract key methodology details:
36
+ #
37
+ # 1. Base model(s) used
38
+ # 2. Architecture summary
39
+ # 3. Dataset(s) with name, size, and characteristics
40
+ # 4. Experimental setup (conditions, controls, parameters)
41
+ # 5. Implementation details (hardware/software used)
42
+ #
43
+ # Format as a structured table with clear categories.
44
+ # """
45
+ #
46
+ # # Mini-Prompt 4: Key Equations & Technical Approach
47
+ # equations_prompt = """
48
+ # Extract and explain key equations:
49
+ #
50
+ # | Equation | Purpose | Explanation |
51
+ # |----------|---------|-------------|
52
+ # | [Equation 1] | [What it calculates] | [Explanation] |
53
+ # | [Equation 2] | [What it calculates] | [Explanation] |
54
+ #
55
+ # Include 2-3 equations central to the methodology or findings.
56
+ # """
57
+ #
58
+ # # Mini-Prompt 5: Results & Performance
59
+ # results_prompt = """
60
+ # Summarize the performance:
61
+ #
62
+ # 1. 5-7 bullet points for primary findings
63
+ # 2. Performance metrics (accuracy, F1, etc.)
64
+ # 3. Comparison with previous work or baselines
65
+ # 4. Highlights from ablation studies
66
+ #
67
+ # Present performance in a table if applicable:
68
+ #
69
+ # | Metric | Value | Compared to Baseline |
70
+ # |--------|-------|-----------------------|
71
+ # """
72
+ #
73
+ # # Mini-Prompt 6: Critical Analysis
74
+ # analysis_prompt = """
75
+ # Identify strengths and weaknesses:
76
+ #
77
+ # | Strengths | Limitations |
78
+ # |-----------|-------------|
79
+ # | [Point 1] | [Limitation 1] |
80
+ # | [Point 2] | [Limitation 2] |
81
+ # | [Point 3] | [Limitation 3] |
82
+ #
83
+ # Also note:
84
+ # - Key assumptions made
85
+ # - Generalizability of findings
86
+ # - Ethical concerns, if mentioned
87
+ # """
88
+ #
89
+ # # Mini-Prompt 7: Implications & Future Work
90
+ # implications_prompt = """
91
+ # Extract future-oriented content:
92
+ #
93
+ # 1. Research implications (field-level impact)
94
+ # 2. Practical/industry applications
95
+ # 3. Future research directions
96
+ # 4. Unanswered or open questions
97
+ #
98
+ # Use 3-4 concise bullet points.
99
+ # """
100
+ #
101
+ # # Mini-Prompt 8: Executive Summary
102
+ # executive_summary_prompt = """
103
+ # Create a concise executive summary (max 250 words):
104
+ #
105
+ # 1. One-paragraph overview
106
+ # 2. Problem being addressed
107
+ # 3. Approach and methods
108
+ # 4. Key results
109
+ # 5. Significance of the findings
110
+ # """
111
+
112
+
113
+ # basic_info_prompt = """You are extracting basic information from a research paper. Please provide:
114
+ #
115
+ # 1. Paper title
116
+ # 2. Authors and affiliations
117
+ # 3. Publication date and venue/journal
118
+ # 4. DOI/URL if available
119
+ # 5. Citation information
120
+ # 6. Research domain/field
121
+ #
122
+ # Format as a simple table with categories and details."""
123
+ #
124
+ # # Mini-Prompt 2: Research Objectives and Abstract
125
+ # objectives_prompt = """Based on the research paper, please extract:
126
+ #
127
+ # 1. The primary research question/objective (2-3 bullet points)
128
+ # 2. A condensed abstract summary (2-3 sentences capturing the essence)
129
+ # 3. The main contributions (3-5 key bullet points)
130
+ # 4. The theoretical foundation/frameworks underlying the research
131
+ #
132
+ # Keep each bullet point to 1-2 sentences, focused on a single idea."""
133
+ #
134
+ # # Mini-Prompt 3: Methodology Details
135
+ # methodology_prompt = """Extract the key methodological details from the paper:
136
+ #
137
+ # 1. Base model(s) used (if applicable)
138
+ # 2. System/model architecture (concise description)
139
+ # 3. Datasets used (names, sizes, characteristics)
140
+ # 4. Experimental setup (conditions, controls, parameters)
141
+ # 5. Implementation details (hardware, software, computational resources)
142
+ #
143
+ # Present this information in a structured table format."""
144
+ #
145
+ # # Mini-Prompt 4: Key Equations and Technical Approach
146
+ # equations_prompt = """Identify and explain the most important equations and technical approaches:
147
+ #
148
+ # 1. Extract 2-3 key equations/formulations
149
+ # 2. For each equation, explain:
150
+ # - What it calculates
151
+ # - Its purpose in the paper
152
+ # - How it relates to the overall methodology
153
+ #
154
+ # Format as a table with columns for Equation, Purpose, and Explanation."""
155
+ #
156
+ # # Mini-Prompt 5: Results and Performance
157
+ # results_prompt = """Summarize the main results and performance metrics:
158
+ #
159
+ # 1. Primary findings (5-7 bullet points)
160
+ # 2. Performance metrics (accuracy, F1, BLEU, etc.)
161
+ # 3. Comparison to prior or competing approaches
162
+ # 4. Key insights from ablation studies
163
+ #
164
+ # Present performance metrics in a table with columns for Metric, Value, and Comparison to Previous Work."""
165
+ #
166
+ # # Mini-Prompt 6: Critical Analysis
167
+ # analysis_prompt = """Analyze the strengths and limitations of the paper:
168
+ #
169
+ # 1. Clearly stated limitations (3-4 bullet points)
170
+ # 2. Key assumptions made by the authors
171
+ # 3. Assessment of generalizability of findings
172
+ # 4. Ethical considerations mentioned
173
+ #
174
+ # Present as a comparison table with Strengths and Limitations columns."""
175
+ #
176
+ # # Mini-Prompt 7: Implications and Future Work
177
+ # implications_prompt = """Extract information about implications and future directions:
178
+ #
179
+ # 1. Research implications (how this advances the field)
180
+ # 2. Practical/industry applications of the findings
181
+ # 3. Future research directions identified by the authors
182
+ # 4. Unresolved questions that emerge from this work
183
+ #
184
+ # Provide as 3-4 concise bullet points focusing on significance and future work."""
185
+ #
186
+ # # Mini-Prompt 8: Executive Summary
187
+ # executive_summary_prompt = """Create a concise executive summary of the paper with these components:
188
+ #
189
+ # 1. One-paragraph overview (3-5 sentences)
190
+ # 2. The problem being solved
191
+ # 3. The approach taken
192
+ # 4. The key results
193
+ # 5. Why this matters
194
+ #
195
+ # Keep the entire summary under 250 words for quick reference."""
196
+
197
+
198
+ # ================= ================= ================= ================= ================= =================
199
+
200
+
201
+ basic_info_prompt = """# Basic Paper Information
202
+
203
+ Generate a concise summary of the paper's essential metadata using the table below. Ensure all details are accurately extracted and easy for researchers to scan. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
204
+
205
+ | Information | Details |
206
+ |-------------------|----------------------------------|
207
+ | **Title** | [Full title of the paper] |
208
+ | **Authors** | [Complete list of authors] |
209
+ | **Publication Venue** | [Journal/Conference, Year] |
210
+ | **Research Field**| [Primary domain or discipline] |
211
+ | **Keywords** | [Relevant terms and topics - use bullet points if multiple] |
212
+ """
213
+
214
+ research_focus_prompt = """# Core Research Focus
215
+
216
+ Summarize the central aim, problem, contribution, and significance of the paper. Present the information clearly and concisely using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
217
+
218
+ * **Research Question:** [What is being investigated? State this clearly.]
219
+ * **Problem Statement:** [What specific gap or issue does the paper address? Be direct.]
220
+ * **Main Contribution:** [What is the core offering, innovation, or finding? Highlight the novelty.]
221
+ * **Significance:** [Why is this research important for the field or practice? Briefly explain the impact.]
222
+ """
223
+
224
+ abstract_prompt = """# Abstract Summary
225
+
226
+ Break down the paper's abstract into its fundamental components for quick comprehension. Present the information concisely using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
227
+
228
+ * **Background:** [Brief context leading to the study]
229
+ * **Problem:** [The specific issue the paper tackles]
230
+ * **Methodology:** [Approach, methods, or techniques used]
231
+ * **Key Findings:** [Main results or discoveries - use sub-bullets if needed]
232
+ * **Conclusion:** [Primary takeaway or implication]
233
+ """
234
+
235
+ methods_prompt = """# Methodology Summary
236
+
237
+ Describe how the research was conducted, focusing on key aspects like study design, data, techniques, and evaluation. Present the information concisely using bullet points, with sub-bullets for details. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
238
+
239
+ * **Study Design:** [e.g., Experimental, Simulation, Case Study, etc.]
240
+ * **Dataset(s):**
241
+ * Source: [Where the data came from]
242
+ * Size: [Amount of data]
243
+ * Key Characteristics: [Important features or properties]
244
+ * Preprocessing: [Main steps taken to prepare data]
245
+ * **Techniques/Models:** [Specific models, algorithms, or frameworks used - list key ones]
246
+ * **Evaluation:**
247
+ * Metrics: [How performance/success was measured - list key metrics]
248
+ * Setup: [Briefly describe evaluation setup if notable]
249
+ * **Tools & Software:** [Libraries, platforms, hardware specifics if critical]
250
+ """
251
+
252
+ results_prompt = """# Key Results
253
+
254
+ List and explain the paper's main outcomes and their importance. Use the table for primary findings and bullet points for comparisons to prior work. Keep descriptions and insights brief and impactful. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
255
+
256
+ | Finding # | Description of Result | Significance / Insight |
257
+ |-----------|-------------------------------|----------------------------------|
258
+ | 1 | [What was observed/found?] | [Why this result is important or novel?] |
259
+ | 2 | [Another key result] | [Its implication or contribution] |
260
+ | 3 | [Third main finding] | [What we learn from this] |
261
+ | ... | [Add more rows as needed] | [Corresponding insight] |
262
+
263
+ **Comparison to Prior Work:**
264
+ * [Highlight how these results differ from or improve upon previous research.]
265
+ * [Mention specific previous work if comparison is direct.]
266
+ * [Explain why the improvement or difference matters.]
267
+ """
268
+
269
# Prompt template asking for a table of the paper's key figures and tables.
# It carries the same output-language and missing-detail instructions as the
# other active prompt templates in this module, so every section is produced
# under the same rules.
visuals_prompt = """# Important Figures & Tables

Highlight the most critical visualizations and tabular data from the paper. Explain their content and why they are important for understanding the research. Use the table below. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

| Visual Element | Brief Description | Key Insight or Interpretation |
|-----------------|-------------------------------|--------------------------------------|
| **Figure [Number]**| [What the figure depicts or shows] | [What key point or data trend does it illustrate?] |
| **Table [Number]** | [Summary of data/content in the table] | [What conclusion or comparison can be drawn from this table?] |
| **Figure [Number]**| [Another key visualization] | [Why is this figure crucial for the results or argument?] |
| ... | [Add more rows as needed] | [Corresponding insight] |
"""
280
+
281
+ limitations_prompt = """# Limitations & Future Work
282
+
283
+ Detail the limitations encountered during the research and outline suggested future directions. Use bullet points for both limitations and future work. Be concise. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
284
+
285
+ **Limitations:**
286
+ * **Theoretical:** [Conceptual limits of the approach, with brief impact]
287
+ * **Methodological:** [Issues with design or procedure, with brief impact]
288
+ * **Data-Related:** [Constraints due to data quality/availability, with brief impact]
289
+ * [Add other relevant limitations]
290
+
291
+ **Future Work Suggestions:**
292
+ * [Proposed next steps or improvements to the current work.]
293
+ * [New areas or questions for future research based on these findings.]
294
+ * [Potential experiments or applications to explore.]
295
+ """
296
+
297
+ contributions_prompt = """# Main Contributions
298
+
299
+ List all major contributions of the paper, categorized by type. Explain how each contribution adds value or novelty to the field. Use bullet points, with sub-bullets for novelty/advancement. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
300
+
301
+ * **Theoretical:** [New framework, concept, or insight introduced]
302
+ * Novelty/Advancement: [How it extends or changes existing theory]
303
+ * **Methodological:** [New method, algorithm, or model developed]
304
+ * Novelty/Advancement: [What makes it different, better, or more efficient?]
305
+ * **Empirical:** [Significant findings or results from experiments/data]
306
+ * Novelty/Advancement: [Why these results matter or what they demonstrate?]
307
+ * **Practical:** [Applications, systems, or tools developed]
308
+ * Novelty/Advancement: [Real-world relevance or utility]
309
+ * [Add other relevant contributions]
310
+
311
+ **Most Noteworthy Contribution:** [Briefly summarize the single biggest impact or most innovative aspect of the paper.]
312
+ """
313
+
314
+ related_work_prompt = """# Related Work
315
+
316
+ Show how this research fits into the existing landscape of studies and what specific gaps it addresses. Use the table to compare this work to previous approaches and list the addressed gap using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
317
+
318
+ | Topic/Area | Previous Approaches | This Paper's Innovation / Difference |
319
+ |-----------------|----------------------------------|----------------------------------------|
320
+ | **[Relevant Area 1]**| [Summary of how prior work handled this] | [What new approach, technique, or finding is introduced here?] |
321
+ | **[Relevant Area 2]**| [Other related methods or studies] | [How does this paper build upon or deviate from them?] |
322
+ | **[Relevant Area 3]**| [Existing theories or models] | [Enhancements, alternatives, or validations provided by this work] |
323
+ | ... | [Add more rows as needed] | [Corresponding innovation] |
324
+
325
+ **Gap Addressed:**
326
+ * [What specific problem, limitation, or missing piece in the existing literature does this paper tackle?]
327
+ """
328
+
329
+ applications_prompt = """# Practical Applications
330
+
331
+ Explore potential real-world applications of the research findings or methods. Use the table to detail potential use cases, required conditions, and feasibility. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
332
+
333
+ | Domain/Industry | Potential Use Case or Application | Key Requirements or Dependencies | Feasibility/Timeline (e.g., Short/Med/Long term) |
334
+ |-----------------|--------------------------------------|-------------------------------------|-------------------|
335
+ | **[Domain 1]** | [How can the results/methods be used here?] | [What data, technology, or infrastructure is needed?] | [Estimated time to potential deployment] |
336
+ | **[Domain 2]** | [Another potential application area] | [Factors affecting feasibility or adoption] | [Estimated time to potential deployment] |
337
+ | **[Domain 3]** | [Innovative potential application] | [Challenges or conditions for implementation] | [Estimated time to potential deployment] |
338
+ | ... | [Add more rows as needed] | [Corresponding requirements] | [Corresponding timeline] |
339
+
340
+ **Most Promising Use Case:** [Briefly highlight the application with the highest potential impact or feasibility.]
341
+ """
342
+
343
+ technical_prompt = """# Technical Details
344
+
345
+ Provide a concise summary of the paper's specific technical aspects. Use the table for algorithms, architecture, implementation, and performance. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
346
+
347
+ | Component | Description | Key Configuration or Parameters |
348
+ |-------------------|-------------------------------------|--------------------------------------|
349
+ | **Algorithm(s)** | [What specific algorithm(s) are central?] | [Key hyperparameters, variations used, etc.] |
350
+ | **Model/Architecture**| [Type or design of the model/system] | [Number of layers, components, specific structure details] |
351
+ | **Implementation**| [Languages, key libraries, environment specifics] | [Frameworks used (TensorFlow, PyTorch, etc.), notable dependencies] |
352
+ | **Performance** | [Key performance metrics reported] | [Results achieved (e.g., Accuracy %, F1 score, latency ms)] |
353
+ | ... | [Add more rows as needed] | [Corresponding details] |
354
+
355
+ """
356
+
357
+ quick_summary_prompt = """# Quick Summary
358
+
359
+ Provide a highly concise summary of the entire paper, suitable for a quick grasp of its core message. Include both a brief paragraph and a single-sentence version. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
360
+
361
+ **Brief Summary (3–5 Sentences):**
362
+ [Write a concise summary covering the paper's motivation, core method, main findings, and overall significance.]
363
+
364
+ **One-Sentence Summary:**
365
+ [Write a single, impactful sentence that captures the paper’s most important contribution or finding.]
366
+ """
367
+
368
+ reading_guide_prompt = """# Reading Guide
369
+
370
+ Help researchers quickly navigate the paper by highlighting the most important sections and the key information found within them. Suggest an efficient reading path. Use the table for key sections and bullet points for the reading path. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
371
+
372
+ | Section Name | Key Information or Reason to Focus Here |
373
+ |-------------------|------------------------------------|
374
+ | **[Section Name 1]**| [What is the main idea or critical takeaway from this section?] |
375
+ | **[Section Name 2]**| [Why is this section particularly insightful or important for understanding the work?] |
376
+ | **[Section Name 3]**| [What key details or results are presented here?] |
377
+ | ... | [Add more rows as needed] |
378
+ | **[Conclusion Section]**| [Main takeaways and future implications.] |
379
+
380
+ **Recommended Reading Path:**
381
+ * [Suggest an efficient order to read the key sections for maximum understanding (e.g., Abstract -> Introduction -> Methods (key parts) -> Results (key figures/tables) -> Conclusion).]
382
+ """
383
+
384
+ equations_prompt = """# Key Equations
385
+
386
+ Highlight and explain the major equations presented in the paper. For each equation, describe its purpose, define its variables, and explain its significance to the research. Use the table below. Use LaTeX format ($$...$$ for block, $...$ for inline) for equations. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
387
+
388
+ | Equation | Purpose or Role in the Paper | Why It Matters to the Research |
389
+ |-------------------|-----------------------------------|------------------------------------|
390
+ | $$ [Equation 1] $$ | [What the equation calculates, models, or represents] | [Its role in the method, results, or theory] |
391
+ | $$ [Equation 2] $$ | [Purpose of this equation] | [Its impact on the conclusions or findings] |
392
+ | $$ [Equation 3] $$ | [Purpose of this equation] | [How it supports the overall argument] |
393
+ | ... | [Add more rows as needed] | [Corresponding significance] |
394
+ """
395
+
396
+ executive_summary_prompt = """# Executive Summary
397
+
398
+ Provide a high-level summary of the paper, tailored for research leads, grant reviewers, or collaborators. Focus on the problem, solution, key results, and implications using concise bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
399
+
400
+ * **Research Problem:** [Clear articulation of the challenge the paper addresses]
401
+ * **Proposed Solution:** [Brief overview of the method, model, or approach introduced]
402
+ * **Major Results:** [Highlights of the most significant findings or achievements - use sub-bullets if needed]
403
+ * **Implications:** [Practical, theoretical, or future impact of the work]
404
+ * **Relevance:** [Why this paper is important and should be paid attention to]
405
+ """
406
+
407
+ # ================= ================= ================= ================= ================= =================
408
+
409
+
410
+
411
+
412
+ # 1. Basic Paper Information
413
+ # basic_info_prompt = """# Basic Paper Information
414
+ #
415
+ # Extract all essential metadata to clearly identify and classify the research paper. Focus on accurately capturing the publication details.
416
+ #
417
+ # | Information | Details |
418
+ # |-------------------|----------------------------------|
419
+ # | Title | [Full title of the paper] |
420
+ # | Authors | [Complete list of authors] |
421
+ # | Publication Venue | [Journal/Conference, Year] |
422
+ # | Research Field | [Primary domain or discipline] |
423
+ # | Keywords | [Relevant terms and topics] |
424
+ # """
425
+ #
426
+ # # 2. Core Research Focus
427
+ # research_focus_prompt = """# Core Research Focus
428
+ #
429
+ # Summarize the central aim of the paper. Clearly articulate the main question, the addressed problem, and the novelty of the contribution.
430
+ #
431
+ # | Element | Details |
432
+ # |----------------------|-------------------------------------------------|
433
+ # | Research Question | [What is being investigated?] |
434
+ # | Problem Statement | [What gap or issue does the paper address?] |
435
+ # | Main Contribution | [What is the core offering or innovation?] |
436
+ # | Significance | [Why is this research important?] |
437
+ # """
438
+ #
439
+ # # 3. Abstract Summary
440
+ # abstract_prompt = """# Abstract Summary
441
+ #
442
+ # Break down the abstract into its fundamental components for easier comprehension.
443
+ #
444
+ # | Component | Details |
445
+ # |---------------|----------------------------------------|
446
+ # | Background | [Brief context of the study] |
447
+ # | Research Problem | [Specific issue the paper solves] |
448
+ # | Methodology | [Approach or technique used] |
449
+ # | Key Findings | [Main results or discoveries] |
450
+ # | Conclusion | [Primary takeaway from the study] |
451
+ # """
452
+ #
453
+ #
454
+ # # 4. Methodology Summary
455
+ # methods_prompt = """# Methodology Summary
456
+ #
457
+ # Describe how the research was conducted, including data, tools, and procedures.
458
+ #
459
+ # | Component | Details |
460
+ # |------------------|-------------------------------------------|
461
+ # | Study Design | [Experimental, simulation, case study etc.] |
462
+ # | Dataset | [Source, size, preprocessing, etc.] |
463
+ # | Techniques Used | [Models, algorithms, or frameworks used] |
464
+ # | Evaluation Metrics| [How success or performance was measured]|
465
+ # | Tools & Software | [Libraries, platforms, hardware specifics]|
466
+ # """
467
+ #
468
+ #
469
+ # # 5. Key Results
470
+ # results_prompt = """# Key Results
471
+ #
472
+ # List and explain the main outcomes, their impact, and how they compare to past work.
473
+ #
474
+ # | Finding # | Description of Result | Significance / Insight |
475
+ # |-----------|-------------------------------|----------------------------------|
476
+ # | 1 | [What was observed] | [Why it matters] |
477
+ # | 2 | [What was observed] | [Why it matters] |
478
+ # | 3 | [What was observed] | [Why it matters] |
479
+ #
480
+ # **Comparison to Prior Work:** [Highlight how these results differ or improve upon previous research.]
481
+ # """
482
+ #
483
+ #
484
+ # # 6. Important Figures & Tables
485
+ # visuals_prompt = """# Important Figures & Tables
486
+ #
487
+ # Highlight the most critical visualizations and tabular data, explaining their importance.
488
+ #
489
+ # | Figure/Table | Description | Insight or Interpretation |
490
+ # |--------------|----------------------------------|--------------------------------------|
491
+ # | Figure 1 | [What it shows] | [Why it's important] |
492
+ # | Table 2 | [Data/content summary] | [What we learn from it] |
493
+ # | Figure 3 | [Trend or structure depicted] | [Significance to conclusions] |
494
+ # """
495
+ #
496
+ #
497
+ # # 7. Limitations & Future Work
498
+ # limitations_prompt = """# Limitations & Future Work
499
+ #
500
+ # Detail the limitations encountered in the research and outline proposed future directions.
501
+ #
502
+ # | Type | Limitation Description | Potential Impact |
503
+ # |----------------|----------------------------------|--------------------------------------|
504
+ # | Theoretical | [Conceptual limits] | [Effect on validity/generalizability]|
505
+ # | Methodological | [Design or procedure issues] | [Effect on robustness] |
506
+ # | Data-Related | [Data quality, availability] | [Effect on conclusions] |
507
+ #
508
+ # **Future Work Suggestions:**
509
+ # - [Proposed improvement or next step]
510
+ # - [New areas to explore]
511
+ # - [Potential experiments or applications]
512
+ # """
513
+ #
514
+ #
515
+ # # 8. Main Contributions
516
+ # contributions_prompt = """# Main Contributions
517
+ #
518
+ # List all major contributions by type, and explain how each adds value.
519
+ #
520
+ # | Category | Contribution Summary | Novelty or Advancement |
521
+ # |----------------|----------------------------------|----------------------------------------|
522
+ # | Theoretical | [New framework or insight] | [How it extends theory] |
523
+ # | Methodological | [New method/model] | [What makes it different or better] |
524
+ # | Empirical | [Results from data/experiments] | [Why they matter] |
525
+ # | Practical | [Applications or systems] | [Real-world relevance] |
526
+ #
527
+ # **Most Noteworthy Contribution:** [Summarize the biggest impact of the paper]
528
+ # """
529
+ #
530
+ # # 9. Related Work
531
+ # related_work_prompt = """# Related Work
532
+ #
533
+ # Show how this research fits into the existing landscape and what gaps it fills.
534
+ #
535
+ # | Topic/Area | Previous Approaches | This Paper's Innovation |
536
+ # |------------------|----------------------------------|----------------------------------------|
537
+ # | Area 1 | [Summary of prior methods] | [What’s new in this work] |
538
+ # | Area 2 | [Prior attempts or models] | [Improvements or alternatives] |
539
+ # | Area 3 | [Old techniques or theories] | [Enhancements introduced here] |
540
+ #
541
+ # **Gap Addressed:** [What missing element or inefficiency this paper tackles]
542
+ # """
543
+ #
544
+ # # 10. Practical Applications
545
+ # applications_prompt = """# Practical Applications
546
+ #
547
+ # Explore how the research can be applied in real-world domains.
548
+ #
549
+ # | Domain/Industry | Use Case or Application | Requirements or Dependencies | Expected Timeline |
550
+ # |------------------|----------------------------------|-------------------------------------|-------------------|
551
+ # | Domain 1 | [What can be done] | [Tech, data, adoption needs] | [Short/Med/Long] |
552
+ # | Domain 2 | [Another use case] | [Feasibility factors] | [Short/Med/Long] |
553
+ # | Domain 3 | [Innovative potential] | [Deployment conditions] | [Short/Med/Long] |
554
+ #
555
+ # **Most Promising Use Case:** [Brief highlight of top application potential]
556
+ # """
557
+ #
558
+ #
559
+ # # 11. Technical Details
560
+ # technical_prompt = """# Technical Details
561
+ #
562
+ # Dive into the specific technical aspects, including algorithms, architecture, and implementation details.
563
+ #
564
+ # | Component | Description | Configuration or Parameters |
565
+ # |---------------|-------------------------------------|--------------------------------------|
566
+ # | Algorithm | [What algorithm is used] | [Hyperparameters, version etc.] |
567
+ # | Model/Architecture | [Type or design used] | [Layers, connections, components] |
568
+ # | Implementation| [Languages, packages, environment] | [Frameworks, hardware specifics] |
569
+ # | Performance | [Observed performance] | [Accuracy, latency, etc.] |
570
+ #
571
+ # **Code Repository:** [Link if available or mention if not provided]
572
+ # """
573
+ #
574
+ #
575
+ # # 12. Quick Summary
576
+ # quick_summary_prompt = """# Quick Summary
577
+ #
578
+ # Provide an overview of the entire paper in both concise and single-sentence formats.
579
+ #
580
+ # **Brief Summary (3–5 Sentences):**
581
+ # [Include motivation, methodology, findings, and significance.]
582
+ #
583
+ # **One-Sentence Summary:**
584
+ # [A compact summary capturing the paper’s core message.]
585
+ # """
586
+ #
587
+ #
588
+ # # 13. Reading Guide
589
+ # reading_guide_prompt = """# Reading Guide
590
+ #
591
+ # Help readers focus on the most insightful sections.
592
+ #
593
+ # | Section Name | Key Information or Reason to Read |
594
+ # |-------------------|------------------------------------|
595
+ # | [Section A] | [Main idea or takeaway] |
596
+ # | [Section B] | [Core implementation detail] |
597
+ # | [Section C] | [Critical results or discussion] |
598
+ #
599
+ # **Recommended Reading Path:** [Suggestion for efficient reading – e.g., skip intro, read methods, then results]
600
+ # """
601
+ #
602
+ #
603
+ # # 14. Key Equations
604
+ # equations_prompt = """# Key Equations
605
+ #
606
+ # Highlight and explain major equations in the paper.
607
+ #
608
+ # | Equation | Purpose | Variable Explanation | Why It Matters |
609
+ # |------------------|-----------------------------------|---------------------------------------|------------------------------------|
610
+ # | [Equation 1] | [What it calculates or models] | [Define each term] | [Its role in the paper] |
611
+ # | [Equation 2] | [Purpose] | [Define each term] | [Impact on method/results] |
612
+ # | [Equation 3] | [Purpose] | [Define each term] | [How it supports the conclusions] |
613
+ # """
614
+ #
615
+ # # 15. Executive Summary
616
+ # executive_summary_prompt = """# Executive Summary
617
+ #
618
+ # Offer a high-level summary tailored for research leads, grant reviewers, or collaborators.
619
+ #
620
+ # | Section | Description |
621
+ # |----------------|--------------------------------------|
622
+ # | Research Problem | [Clear articulation of the challenge] |
623
+ # | Proposed Solution| [Brief on method/model introduced] |
624
+ # | Major Results | [Highlights of key findings] |
625
+ # | Implications | [Practical, theoretical impact] |
626
+ # | Relevance | [Why this paper should be read] |
627
+ # """
app/summarization/summarizer.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from typing import Dict, List
3
+ import cohere
4
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
5
+ from ..config.settings import LLM_MODEL
6
+
7
+
8
class DocumentSummarizer:
    """Build a multi-section Markdown summary of a single document.

    For every entry in ``self.components`` the summarizer asks the retriever
    for matching chunks, sends them with that component's prompt to the
    Cohere chat API, and finally stitches the answers together with
    ``compile_summary``.
    """

    def __init__(self, retriever, batch_size=4):
        """Store collaborators and load the prompt table.

        Args:
            retriever: Object exposing ``get_relevant_docs(chromdb_query=...,
                rerank_query=..., filter=..., chunk_size=...)``.
            batch_size: Stored on the instance; not read by this class.
        """
        self.batch_size = batch_size
        self.retriever = retriever

        # NOTE(review): no API key is passed here; presumably the client
        # picks it up from the environment -- confirm against deployment.
        self.cohere_client = cohere.ClientV2()

        # Component key -> human-readable section title.
        self.components = {
            'basic_info': "Basic Paper Information",
            'abstract': "Abstract Summary",
            'methods': "Methodology Summary",
            'results': "Key Results",
            'limitations': "Limitations & Future Work",
            'related_work': "Related Work",
            'applications': "Practical Applications",
            'technical': "Technical Details",
            'equations': "Key Equations",
        }

        self.prompts = self._initialize_prompts()

        # Cross-check the two tables so a missing prompt or a stray prompt
        # is reported at construction time rather than mid-run.
        missing_prompts = [comp for comp in self.components if comp not in self.prompts]
        if missing_prompts:
            print(f"Warning: No prompts found for components: {missing_prompts}")
        missing_components = [key for key in self.prompts if key not in self.components]
        if missing_components:
            print(f"Warning: Prompts found for components not in self.components: {missing_components}")

    def _initialize_prompts(self):
        """Return the mapping from component key to prompt text.

        Only prompts that back an entry in ``self.components`` are imported,
        so an unused prompt name cannot break construction with ImportError.
        """
        from ..summarization.prompt2 import (
            abstract_prompt,
            applications_prompt,
            basic_info_prompt,
            equations_prompt,
            limitations_prompt,
            methods_prompt,
            related_work_prompt,
            results_prompt,
            technical_prompt,
        )
        return {
            'basic_info': basic_info_prompt,
            'abstract': abstract_prompt,
            'methods': methods_prompt,
            'results': results_prompt,
            'limitations': limitations_prompt,
            'related_work': related_work_prompt,
            'applications': applications_prompt,
            'technical': technical_prompt,
            'equations': equations_prompt,
        }

    def summarize_text(self, documents: List[Dict], prompt: str, language: str):
        """Summarize ``documents`` with ``prompt`` via the Cohere chat API.

        Args:
            documents: Retrieved chunks, passed to the API unchanged.
            prompt: User-message text for this component.
            language: Language the system message asks the model to answer in.

        Returns:
            The text of the first content item of the response, or ``None``
            when there are no documents, the response lacks text, or the API
            call raises.
        """
        if not documents:
            print("Warning: No documents provided for summarization.")
            return None

        try:
            response = self.cohere_client.chat(
                model=LLM_MODEL,
                documents=documents,
                messages=[
                    {"role": "system", "content": f"You are an expert summarization AI. Please respond in {language}."},
                    {"role": "user", "content": f"{prompt}"},
                ],
            )
            if response and response.message and response.message.content and response.message.content[0] and response.message.content[0].text:
                return response.message.content[0].text
            print(f"Warning: Unexpected API response structure for prompt: {prompt[:50]}...")
            return None
        except Exception as e:
            # Best effort: a failed component must not abort the whole summary.
            print(f"Error during Cohere API call: {e}")
            return None

    def extract_relevant_documents(self, component: str, filename: str, chunk_size: int):
        """Fetch the chunks relevant to ``component`` for ``filename``.

        Returns:
            Whatever the retriever returns, or ``[]`` if retrieval raises.
        """
        query = f"Analyze the {self.components.get(component, component)} section from the document titled '{filename}'."
        try:
            return self.retriever.get_relevant_docs(
                chromdb_query=query,
                rerank_query=query,
                filter={'filename': filename},
                chunk_size=chunk_size,
            )
        except Exception as e:
            print(f"Error during document retrieval for component {component}: {e}")
            return []

    def summarize_document(self, filename: str, language: str, chunk_size: int):
        """Summarize one document, processing its components concurrently.

        Each component runs in a worker thread. Components that fail are
        reported on stdout and left out of the compiled summary.

        Returns:
            The Markdown produced by ``compile_summary`` for the components
            that yielded text.
        """
        start_total = time.time()
        results = {}
        errors = {}

        def process_component(comp):
            """Return ``(comp, summary_or_None, error_or_None)``."""
            comp_start = time.time()
            print(f"Starting processing for component: {comp}")
            try:
                document_chunks = self.extract_relevant_documents(comp, filename, chunk_size)
                if not document_chunks:
                    print(f"No documents found for component: {comp} for file {filename}")
                    return comp, None, "No documents found"

                prompt = self.prompts.get(comp)
                if not prompt:
                    print(f"No prompt defined for component: {comp}")
                    return comp, None, "No prompt defined"

                summary = self.summarize_text(document_chunks, prompt, language)
                print(f"Finished processing for component: {comp}. Time taken: {time.time() - comp_start:.2f} seconds")
                if not summary:
                    # summarize_text swallows API problems and returns None;
                    # record that as a failure instead of losing it.
                    return comp, None, "Summarization returned no content"
                return comp, summary, None
            except Exception as e:
                print(f"Error processing component {comp}: {e}. Time taken: {time.time() - comp_start:.2f} seconds")
                return comp, None, str(e)

        # Threads suit this workload: each task mostly waits on retrieval
        # and the chat API.
        with ThreadPoolExecutor(max_workers=None) as executor:
            future_to_component = {
                executor.submit(process_component, comp): comp for comp in self.components
            }
            for future in as_completed(future_to_component):
                comp = future_to_component[future]
                try:
                    comp_name, result, error = future.result()
                except Exception as exc:
                    print(f'{comp} generated an exception: {exc}')
                    errors[comp] = str(exc)
                    continue
                if result is not None:
                    results[comp_name] = result
                elif error:
                    errors[comp_name] = error

        print(f"\n--- Total summarization time for {filename}: {time.time() - start_total:.2f} seconds ---\n")

        compiled = self.compile_summary(filename, results)
        if errors:
            print(f"Components that failed or returned no data: {list(errors.keys())}")
        return compiled

    # Backward-compatible alias: existing callers use the misspelled name.
    summerize_document = summarize_document

    def compile_summary(self, filename: str, results: Dict[str, str]) -> str:
        """Join per-component results into one Markdown document.

        Sections follow a fixed preferred order; any result whose key is not
        in that order is appended afterwards instead of being dropped. Empty
        results are skipped.
        """
        sections_order = [
            'basic_info', 'abstract',
            'methods', 'results', 'equations', 'technical',
            'related_work', 'applications', 'limitations',
        ]
        # Keep results for keys missing from the preferred order.
        ordered_keys = sections_order + [key for key in results if key not in sections_order]

        lines = [f"# Summary of {filename}", f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"]
        for section in ordered_keys:
            body = results.get(section)
            if not body:
                continue
            # Fall back to the raw key when no display title is registered.
            title = self.components.get(section, section).title()
            lines.append(f"## {title}\n")
            lines.append(f"{body}\n")

        return "\n".join(lines)
app/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Utility package.
3
+ """
app/utils/enviornments.py ADDED
File without changes
app/utils/progress_tracker.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Progress monitoring utilities for tracking parallel processing.
3
+ """
4
+ import time
5
+ import threading
6
+ import logging
7
+ from typing import Dict, List, Any, Set
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ProgressTracker:
    """
    Tracks progress of parallel document processing tasks.

    Counters are guarded by ``self.lock``; an optional background thread
    logs a progress line every ``update_interval`` seconds.
    """

    def __init__(self, total_documents, update_interval=5):
        """
        Initialize the progress tracker.

        Args:
            total_documents: Total number of documents to process
            update_interval: How often to log updates (in seconds)
        """
        self.total = total_documents
        self.completed = 0
        self.failed = 0
        self.in_progress = 0
        # Filenames already counted, so a file is tallied at most once.
        self.processed_files = set()
        self.update_interval = update_interval
        self.lock = threading.Lock()
        self.start_time = time.time()
        self.monitor_thread = None
        self.stop_monitoring = threading.Event()

    def mark_started(self, filename):
        """Mark a document as being processed."""
        with self.lock:
            self.in_progress += 1
        logger.info(f"Started processing: {filename}")

    def mark_completed(self, filename, success=True):
        """Mark a document as finished, counting each filename only once.

        Args:
            filename: Name of the finished document.
            success: True counts it as completed, False as failed.
        """
        with self.lock:
            # Never report a negative number of in-flight documents, even if
            # a completion arrives without a matching mark_started call.
            self.in_progress = max(0, self.in_progress - 1)
            if filename in self.processed_files:
                return
            self.processed_files.add(filename)
            if success:
                self.completed += 1
            else:
                self.failed += 1

    def get_stats(self):
        """Get current processing statistics.

        Returns:
            Dict with keys ``total``, ``completed``, ``failed``,
            ``in_progress``, ``remaining``, ``elapsed_seconds`` and
            ``estimated_remaining_seconds`` (``None`` until at least one
            document has finished).
        """
        with self.lock:
            elapsed = time.time() - self.start_time
            finished = self.completed + self.failed
            remaining = max(0, self.total - finished)

            # Failed documents consumed wall-clock time too, so the average
            # is taken over everything that has finished.
            if finished > 0:
                est_remaining = (elapsed / finished) * remaining
            else:
                est_remaining = None

            return {
                'total': self.total,
                'completed': self.completed,
                'failed': self.failed,
                'in_progress': self.in_progress,
                'remaining': remaining,
                'elapsed_seconds': elapsed,
                'estimated_remaining_seconds': est_remaining
            }

    def _format_time(self, seconds):
        """Format seconds as HH:MM:SS, or "unknown" for ``None``."""
        if seconds is None:
            return "unknown"

        whole = max(0, int(seconds))
        hours, remainder = divmod(whole, 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    def _monitor_progress(self):
        """Log progress periodically until stopped or all documents are done."""
        while not self.stop_monitoring.is_set():
            stats = self.get_stats()

            logger.info(
                f"Progress: {stats['completed']}/{stats['total']} completed, "
                f"{stats['failed']} failed, {stats['in_progress']} in progress | "
                f"Elapsed: {self._format_time(stats['elapsed_seconds'])} | "
                f"Est. remaining: {self._format_time(stats['estimated_remaining_seconds'])}"
            )

            if stats['completed'] + stats['failed'] >= stats['total']:
                logger.info("All documents processed!")
                break

            # Event.wait doubles as an interruptible sleep: stop() wakes it.
            self.stop_monitoring.wait(self.update_interval)

    def start_monitoring(self):
        """Start the background monitoring thread (no-op if already running)."""
        if self.monitor_thread and self.monitor_thread.is_alive():
            return
        self.monitor_thread = threading.Thread(target=self._monitor_progress)
        # Daemon thread so a forgotten stop() cannot keep the process alive.
        self.monitor_thread.daemon = True
        self.monitor_thread.start()

    def stop(self):
        """Stop the monitoring thread and report final results.

        Returns:
            The final statistics dict from ``get_stats``.
        """
        if self.monitor_thread and self.monitor_thread.is_alive():
            self.stop_monitoring.set()
            self.monitor_thread.join(timeout=2.0)

        stats = self.get_stats()
        logger.info(
            f"Final results: {stats['completed']}/{stats['total']} completed, "
            f"{stats['failed']} failed | "
            f"Total time: {self._format_time(stats['elapsed_seconds'])}"
        )

        success_rate = (stats['completed'] / stats['total']) * 100 if stats['total'] > 0 else 0
        logger.info(f"Success rate: {success_rate:.2f}%")

        return stats
main.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging # Import logging
2
+ import os
3
+ import tempfile
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import List, Dict, Any, Tuple
7
+
8
+ from app.config.settings import DOCS_FOLDER
9
+ # Import classes from the renamed modules
10
+ from app.document_processing.extractors import DocumentProcessorAdapter
11
+ from app.retrieval.vector_store import Retriever
12
+ from app.summarization.output import SummaryOutputManager
13
+ from app.summarization.summarizer import DocumentSummarizer
14
+
15
+ # Configure logging for the main script
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+
20
def process_uploaded_files(uploaded_files) -> List[Dict[str, Any]]:
    """
    Processes a list of files uploaded via Streamlit.

    Each upload is written into a temporary folder, which is then handed to
    ``DocumentProcessorAdapter.process_folder``. The folder is removed when
    the function returns.

    Args:
        uploaded_files: List of Streamlit UploadedFile objects (objects with
            ``name`` and ``getvalue()``). The type hint is omitted to avoid
            importing Streamlit at module level.

    Returns:
        The list returned by ``process_folder`` for the temporary folder.
    """
    # Imported lazily: only needed here, for st.warning.
    import streamlit as st

    start_time = time.time()
    logger.info(f"Starting processing for {len(uploaded_files)} uploaded files.")

    with tempfile.TemporaryDirectory() as tmpdir:
        logger.info(f"Using temporary directory: {tmpdir}")
        for uploaded_file in uploaded_files:
            # The name comes from the client, so strip any directory part
            # (either separator style) to keep the write inside tmpdir.
            safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
            file_path = os.path.join(tmpdir, safe_name)
            try:
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getvalue())
                logger.debug(f"Saved uploaded file '{uploaded_file.name}' to '{file_path}'")
            except Exception as e:
                # Best effort: a file that cannot be saved is skipped, and
                # the user is told, rather than aborting the whole batch.
                logger.error(f"Error saving uploaded file '{uploaded_file.name}' to temporary directory: {e}", exc_info=True)
                st.warning(f"Could not save uploaded file '{uploaded_file.name}' temporarily. It will be skipped.")

        # Must run inside the with-block, while the temporary files exist.
        processor = DocumentProcessorAdapter()
        extraction_results = processor.process_folder(tmpdir)

    end_time = time.time()
    logger.info(f"Finished processing uploaded files in {end_time - start_time:.2f} seconds.")
    return extraction_results
69
+
70
+
71
def setup_retrieval_system(extraction_results: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Retriever]:
    """
    Build the retrieval system (vector store) from extraction results.

    Args:
        extraction_results: List of dictionaries from document extraction.

    Returns:
        A ``(results, retriever)`` tuple: the value returned by
        ``Retriever.create_from_documents`` and the ``Retriever`` itself.
        NOTE(review): the original notes say the returned results carry a
        per-document 'chunk_size' -- confirm in Retriever.

    Raises:
        Exception: Anything raised during setup is logged and re-raised so
            the caller can surface it.
    """
    began = time.time()
    logger.info("Setting up retrieval system.")
    try:
        store = Retriever()
        enriched_results = store.create_from_documents(extraction_results)
        logger.info(f"Retriever setup complete in {time.time() - began:.2f} seconds.")
        return enriched_results, store
    except Exception as e:
        # Summarization cannot proceed without a retriever, so log with
        # traceback and propagate.
        logger.exception(f"Error during retrieval system setup: {e}")
        raise
100
+
101
+
102
def summarize_extracted_documents(extraction_results: List[Dict[str, Any]], retriever: Retriever) -> List[Dict[str, Any]]:
    """
    Summarizes documents based on extraction results and a configured retriever.

    Args:
        extraction_results: List of dictionaries from document extraction.
            An entry is summarized only if it has non-empty 'text', a
            positive 'chunk_size' and no 'error'.
        retriever: An initialized Retriever instance.

    Returns:
        A list of dictionaries, one per input entry. Skipped entries come
        first, then summarized entries in completion order. Each has:
            - 'filename': The name of the file.
            - 'success': Boolean indicating if summarization was successful.
            - 'summary': The generated summary string (if successful), or None.
            - 'error': An error message string (if not successful), or None.
            - 'processing_time': Time taken for summarization of this file.
    """
    start_time = time.time()
    logger.info(f"Starting summarization for {len(extraction_results)} documents.")

    summarizer = DocumentSummarizer(retriever)

    results = []

    # Split the input in one pass instead of testing list membership.
    summarizable_results = []
    skipped_results = []
    for res in extraction_results:
        has_chunks = (res.get('chunk_size') or 0) > 0  # tolerate chunk_size=None
        if res.get('text') and has_chunks and res.get('error') is None:
            summarizable_results.append(res)
        else:
            skipped_results.append(res)

    if skipped_results:
        logger.warning(f"Skipping summarization for {len(skipped_results)} files due to extraction errors or no text/chunks.")
        for res in skipped_results:
            results.append({
                'filename': res.get('filename', 'unknown'),
                'success': False,
                'summary': None,
                # `or` rather than a .get default: the key may be present
                # with value None when the file simply had no text.
                'error': res.get('error') or 'Extraction failed or no text/chunks',
                'processing_time': 0,
            })

    def process_single_summary(result: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize one extraction result and wrap the outcome in a dict."""
        file_start_time = time.time()
        filename = result.get('filename', 'unknown')
        # Fall back to English when no language was detected.
        language = result.get('language') or 'en'
        chunk_size = result.get('chunk_size', 0)

        logger.info(f"Summarizing document: {filename}")

        try:
            # summerize_document processes the components concurrently itself.
            summary = summarizer.summerize_document(filename, language, chunk_size)
        except Exception as e:
            logger.error(f"Error summarizing document {filename}: {e}", exc_info=True)
            return {
                'filename': filename,
                'success': False,
                'summary': None,
                'error': str(e),
                'processing_time': time.time() - file_start_time,
            }

        elapsed = time.time() - file_start_time
        logger.info(f"Finished summarizing {filename} in {elapsed:.2f} seconds.")
        return {
            'filename': filename,
            'success': True,
            'summary': summary,
            'error': None,
            'processing_time': elapsed,
        }

    with ThreadPoolExecutor(max_workers=None) as executor:
        futures = {
            executor.submit(process_single_summary, res): res.get('filename', 'unknown')
            for res in summarizable_results
        }

        for future in as_completed(futures):
            filename = futures[future]
            try:
                summary_result = future.result()
            except Exception as exc:
                # The helper handles its own errors, so this is a safety net.
                logger.error(f"Exception retrieving summary result for {filename}: {exc}", exc_info=True)
                results.append({
                    'filename': filename,
                    'success': False,
                    'summary': None,
                    'error': f"Failed to retrieve result: {exc}",
                    'processing_time': 0,
                })
            else:
                results.append(summary_result)
                logger.debug(f"Summary result received for {filename}")

    end_time = time.time()
    logger.info(f"Finished batch summarization in {end_time - start_time:.2f} seconds.")
    return results
213
+
214
+
215
+ # if __name__ == "__main__":
216
+ # start_time = time.time()
217
+ # logger.info("Starting document summarization process (command line).")
218
+ #
219
+ # try:
220
+ # # Step 1: Process documents from the predefined folder
221
+ # logger.info(f"Processing documents from: {DOCS_FOLDER}")
222
+ # # DocumentProcessorAdapter().process_folder returns a list of extraction result dicts
223
+ # extraction_results = DocumentProcessorAdapter().process_folder(DOCS_FOLDER)
224
+ # logger.info(f"Document Processing Time taken: {time.time()-start_time:.2f} seconds")
225
+ #
226
+ # # Step 2: Setup retrieval system
227
+ # setup_start_time = time.time()
228
+ # # setup_retrieval_system takes extraction results and returns updated results (with chunk_size) and the retriever
229
+ # extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
230
+ # logger.info(f"Retriever Setup Time taken: {time.time() - setup_start_time:.2f} seconds")
231
+ #
232
+ # # Step 3: Summarize the documents
233
+ # summarization_start_time = time.time()
234
+ # # For command line, we might still want to save files locally
235
+ # output_manager = SummaryOutputManager() # Uses default output_dir from settings
236
+ # # summarize_extracted_documents performs the summarization and returns results
237
+ # summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
238
+ #
239
+ # # Step 4: Save summaries to files (for command-line only)
240
+ # logger.info("Saving summaries to files.")
241
+ # saved_count = 0
242
+ # for res in summary_results:
243
+ # if res['success'] and res['summary']:
244
+ # # Use the output_manager to save the summary string
245
+ # output_manager.save_summary(res['filename'], res['summary'], formats=['markdown'])
246
+ # saved_count += 1
247
+ # logger.info(f"Saved {saved_count} summaries.")
248
+ #
249
+ #
250
+ # logger.info(f"Summarization Time taken: {time.time() - summarization_start_time:.2f} seconds")
251
+ #
252
+ #
253
+ # # Output results summary to console
254
+ # logger.info("\n" + "=" * 50)
255
+ # logger.info("Summarization Process Complete.")
256
+ # logger.info("=" * 50)
257
+ # successful_count = sum(res.get('success', False) for res in summary_results)
258
+ # total_processed = len(summary_results) # Includes skipped files if they were added to results list earlier
259
+ # total_time = time.time() - start_time
260
+ #
261
+ # logger.info(f"Total files attempted: {len(extraction_results)}") # Total files found/attempted extraction
262
+ # logger.info(f"Files successfully extracted and summarizable: {len(extraction_results_with_chunks)}") # Files with text and chunks
263
+ # logger.info(f"Files summarized: {successful_count}/{total_processed}")
264
+ # logger.info(f"Total process time: {total_time:.2f} seconds")
265
+ # logger.info("=" * 50)
266
+ #
267
+ # # Print individual results status
268
+ # logger.info("\nIndividual File Results:")
269
+ # for result in summary_results:
270
+ # name = result.get('filename', 'unknown')
271
+ # status = "SUCCESS" if result['success'] else "FAILED"
272
+ # time_taken = result.get('processing_time', 0)
273
+ # error_msg = result.get('error', '')
274
+ # logger.info(f"- {name}: {status} ({time_taken:.2f}s) {f'Error: {error_msg}' if error_msg else ''}")
275
+ #
276
+ #
277
+ # except FileNotFoundError as fnf_error:
278
+ # logger.error(f"Configuration Error: {fnf_error}")
279
+ # print(f"Error: {fnf_error}")
280
+ # except Exception as main_error:
281
+ # logger.error(f"An unexpected error occurred during the main process: {main_error}", exc_info=True)
282
+ # print(f"An unexpected error occurred: {main_error}")
packages.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ tesseract-ocr
2
+ libtesseract-dev
3
+ tesseract-ocr-hin
4
+ tesseract-ocr-ara
5
+ tesseract-ocr-spa
6
+ tesseract-ocr-chi-sim
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ python-dotenv
3
+ cohere
4
+ langchain
5
+ chromadb
6
+ protobuf~=3.20
7
+ langchain-chroma
8
+ langchain-cohere
9
+ pypdf
10
+ pytesseract
11
+ Pillow