Spaces:
Sleeping
Sleeping
Alphin Jain committed on
Commit ·
334c1a6
0
Parent(s):
First commit
Browse files- LICENSE +21 -0
- README.md +3 -0
- __init__.py +0 -0
- app.py +215 -0
- app/__init__.py +0 -0
- app/config/__init__.py +3 -0
- app/config/settings.py +27 -0
- app/document_processing/__init__.py +3 -0
- app/document_processing/extractors.py +334 -0
- app/retrieval/__init__.py +3 -0
- app/retrieval/vector_store.py +68 -0
- app/summarization/__init__.py +3 -0
- app/summarization/output.py +167 -0
- app/summarization/prompt2.py +627 -0
- app/summarization/summarizer.py +206 -0
- app/utils/__init__.py +3 -0
- app/utils/enviornments.py +0 -0
- app/utils/progress_tracker.py +128 -0
- main.py +282 -0
- packages.txt +6 -0
- requirements.txt +11 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Karan Verma
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Expedition-Aya-Insight
|
| 2 |
+
AI-Powered Multilingual Scientific Summarization with Cohere
|
| 3 |
+
Aya-Insight is a fast, multilingual AI tool that extracts structured, reasoning-driven insights from scientific research papers.
|
__init__.py
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
"""Streamlit UI for the Aya Insight document summarizer.

Gates the app behind a Cohere API key, accepts PDF/image uploads, runs
extraction -> retrieval setup -> summarization via ``main``, and renders
one clickable tile per document with its summary.
"""
import streamlit as st
import os
import sys
import logging

try:
    # Pipeline entry points live in main.py at the project root.
    from main import process_uploaded_files, setup_retrieval_system, summarize_extracted_documents

    # Configure Streamlit's logging to match the application's settings.
    logging.basicConfig(level='INFO', format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger = logging.getLogger(__name__)
    logger.info("Streamlit app started and logging configured.")

    # Flag to check if modules were imported successfully.
    modules_loaded = True

except ImportError as e:
    # BUG FIX: this message was an f-string with no placeholders.
    st.error("Could not import application modules. Please ensure your project structure is correct and dependencies are installed.")
    st.error(f"ImportError: {e}")
    logger = logging.getLogger(__name__)
    logger.error(f"Failed to import application modules: {e}", exc_info=True)
    modules_loaded = False  # Set flag to False if imports fail


# --- Streamlit App Configuration ---
st.set_page_config(
    page_title="Aya Insight Document Summarizer",
    page_icon="📄",
    layout="wide"
)

# --- Session State Initialization ---
# Initialize session state variables if they don't exist.
if 'api_key_entered' not in st.session_state:
    st.session_state.api_key_entered = False
if 'summary_results' not in st.session_state:
    st.session_state.summary_results = None
if 'selected_filename' not in st.session_state:
    st.session_state.selected_filename = None


# --- API Key Input Section ---
if not st.session_state.api_key_entered:
    st.title("🔒 Enter Your Cohere API Key to Unlock")
    api_key = st.text_input("Cohere API Key", type="password", help="Enter your Cohere API key to use the summarization service.")

    if st.button("Unlock"):
        if api_key:
            # Basic validation only (non-empty). A real deployment could
            # verify the key by making a small API call.
            os.environ["COHERE_API_KEY"] = api_key  # Set the environment variable
            st.session_state.api_key_entered = True
            st.success("API Key accepted. You can now upload documents.")
            st.rerun()  # Rerun the app to show the main content
        else:
            st.warning("Please enter your Cohere API key.")

# --- Main Application Content (Unlocked) ---
if st.session_state.api_key_entered and modules_loaded:
    st.title("📄 Aya Insight Document Summarizer")
    st.markdown("""
    Upload one or more PDF or image files to get a structured summary for each document.
    """)

    # --- File Uploader ---
    uploaded_files = st.file_uploader(
        "Choose Document Files",
        type=["pdf", "png", "jpg", "jpeg", "tiff", "bmp", "gif"],  # PDF plus image types
        accept_multiple_files=True,
        help="You can upload multiple PDF or image documents here."
    )

    # --- Summarize Button and Logic ---
    if uploaded_files:  # Only show button if files are uploaded
        st.info(f"You have uploaded {len(uploaded_files)} file(s).")

        if st.button("Generate Summaries", key="summarize_button"):
            # Reset any previous selection before generating new summaries.
            # (The original also re-checked `if not uploaded_files` here,
            # which is unreachable inside this branch and was removed.)
            st.session_state.selected_filename = None
            st.subheader("Processing Documents...")

            # Use a spinner to indicate processing.
            with st.spinner("Processing documents and generating summaries... This may take a few minutes depending on file size and number."):
                try:
                    # Step 1: Process uploaded files (extraction).
                    logger.info(f"Calling process_uploaded_files with {len(uploaded_files)} files.")
                    extraction_results = process_uploaded_files(uploaded_files)
                    logger.info(f"Finished document extraction. {len(extraction_results)} results obtained.")

                    # Warn when nothing was extracted, but keep going so the
                    # user can retry (deliberate: the original commented out
                    # st.stop() here).
                    if not any(res.get('text') for res in extraction_results):
                        st.error("No text could be extracted from the uploaded files. Please check the file formats.")
                        logger.error("No text extracted from any uploaded file.")
                        st.session_state.summary_results = []

                    # Step 2: Setup retrieval system (vector store and embedding).
                    logger.info("Calling setup_retrieval_system.")
                    extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
                    logger.info("Retriever system setup complete.")

                    # Step 3: Summarize the extracted documents.
                    logger.info("Calling summarize_extracted_documents.")
                    summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
                    logger.info(f"Finished summarization. {len(summary_results)} summary results obtained.")

                    st.session_state.summary_results = summary_results  # Store results in session state

                except FileNotFoundError as fnf_error:
                    st.error(f"Configuration Error: {fnf_error}. Please check your environment settings.")
                    logger.error(f"Configuration Error during Streamlit process: {fnf_error}", exc_info=True)
                    st.session_state.summary_results = []  # Store empty results on error
                except Exception as e:
                    st.error(f"An unexpected error occurred during processing: {e}")
                    logger.error(f"An unexpected error occurred during Streamlit process: {e}", exc_info=True)
                    st.session_state.summary_results = []  # Store empty results on error


    # --- Display Document Tiles and Summaries ---
    if st.session_state.summary_results is not None:
        st.subheader("Summaries:")

        if not st.session_state.summary_results:
            st.info("No summaries were generated. Upload files and click 'Generate Summaries'.")
        else:
            # Display files as a grid of clickable tiles.
            files_per_row = 3
            total = len(st.session_state.summary_results)
            rows = total // files_per_row + (total % files_per_row > 0)

            for i in range(rows):
                cols = st.columns(files_per_row)
                for j in range(files_per_row):
                    file_index = i * files_per_row + j
                    if file_index < total:
                        result = st.session_state.summary_results[file_index]
                        filename = result.get('filename', f'File {file_index+1}')

                        with cols[j]:
                            # BUG FIX: the original also rendered an HTML tile
                            # whose onclick pointed at its own hidden button, so
                            # it could never trigger anything; only this
                            # standard button worked, and its label/log had
                            # lost the filename placeholder. The dead HTML is
                            # removed and the filename restored.
                            if st.button(f"📄 {filename}", key=f"tile_button_{file_index}"):
                                st.session_state.selected_filename = filename
                                logger.info(f"Selected file: {filename}")
                                st.rerun()  # Rerun to display the summary

        # Display summary of the selected file.
        if st.session_state.selected_filename:
            st.markdown("---")  # Separator
            st.subheader(f"Summary for: {st.session_state.selected_filename}")

            # Find the summary for the selected file.
            selected_summary = None
            selected_result = None
            for result in st.session_state.summary_results:
                if result.get('filename') == st.session_state.selected_filename:
                    selected_summary = result.get('summary')
                    selected_result = result
                    break

            if selected_summary:
                if selected_result.get('success'):
                    st.markdown(selected_summary)  # Render markdown summary
                else:
                    st.error(f"Could not load summary for {st.session_state.selected_filename}: {selected_result.get('error', 'Unknown error')}")
            else:
                st.info(f"Summary not available for {st.session_state.selected_filename}.")

        # Display overall processing status.
        successful_count = sum(res.get('success', False) for res in st.session_state.summary_results)
        total_files = len(st.session_state.summary_results)
        st.markdown("---")  # Final separator
        st.success(f"Processed {total_files} files. Successfully summarized {successful_count}.")
        if successful_count < total_files:
            st.warning("Some files could not be processed or summarized. See error messages above.")


# --- Message if API Key is not entered and modules loaded ---
if not st.session_state.api_key_entered:
    st.info("Enter your Cohere API Key above to unlock the application functionality.")
app/__init__.py
ADDED
|
File without changes
|
app/config/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration package.
|
| 3 |
+
"""
|
app/config/settings.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application settings, loaded from the environment (.env supported)."""
import os
from dotenv import load_dotenv
import logging

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Silence per-request chatter from the HTTP client used by the Cohere SDK.
logging.getLogger("httpx").setLevel(logging.WARNING)


def _env_int(name: str, default: str) -> int:
    """Read an integer setting from the environment, falling back to *default*."""
    return int(os.getenv(name, default))


# Input documents and summary output locations.
DOCS_FOLDER = os.getenv("DOCS_FOLDER", "samples/pdf5")
SUMMARIES_OUTPUT_DIR = os.getenv("SUMMARIES_OUTPUT_DIR", "summaries")

# Ensure the summaries directory exists before anything writes to it.
os.makedirs(SUMMARIES_OUTPUT_DIR, exist_ok=True)

# Cohere model identifiers for embedding, reranking, and generation.
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "embed-v4.0")
COHERERANK_MODEL = os.getenv('COHERERANK_MODEL', 'rerank-v3.5')
LLM_MODEL = os.getenv("LLM_MODEL", "command-a-03-2025")

# Text-splitting and retrieval tuning knobs.
# NOTE(review): VECTOSTORE_TOPK is presumably a typo for VECTORSTORE_TOPK,
# but the name is imported elsewhere (app/retrieval/vector_store.py) and
# must stay as-is.
CHUNK_SIZE = _env_int("CHUNK_SIZE", "1000")
CHUNK_OVERLAP = _env_int("CHUNK_OVERLAP", "100")
COHERERANK_TOPN = _env_int("COHERERANK_TOPN", "100")
VECTOSTORE_TOPK = _env_int("VECTOSTORE_TOPK", "100")
app/document_processing/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document processing package.
|
| 3 |
+
"""
|
app/document_processing/extractors.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document extraction functionality for processing documents.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import concurrent.futures
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
import cohere
|
| 10 |
+
import logging
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import List, Dict, Any, Optional
|
| 13 |
+
from langchain.docstore.document import Document
|
| 14 |
+
from ..config.settings import CHUNK_SIZE, LLM_MODEL
|
| 15 |
+
|
| 16 |
+
# Configure logging with a null handler by default
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
logger.addHandler(logging.NullHandler())
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DocumentProcessor:
    """Base class for document processors.

    Subclasses declare the file extensions they accept and implement
    :meth:`process` to turn a file into extracted text.
    """

    def __init__(self):
        # Lowercase extensions (with leading dot) this processor accepts.
        self.supported_extensions = []

    def can_process(self, file_path: str) -> bool:
        """Return True when *file_path*'s extension is supported."""
        return Path(file_path).suffix.lower() in self.supported_extensions

    def process(self, file_path: str, **kwargs) -> str:
        """Extract text from the document; subclasses must override."""
        raise NotImplementedError("Subclasses must implement this method")
class PdfProcessor(DocumentProcessor):
    """Processor for PDF documents (text extraction via pypdf)."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.pdf']

    def process(self, file_path: str, **kwargs) -> str:
        """Extract and return the concatenated text of every page.

        Args:
            file_path: Path to the PDF file.
            **kwargs: Ignored; accepted for interface compatibility.

        Returns:
            Page texts joined by newlines, stripped of surrounding
            whitespace.

        Raises:
            Exception: Re-raises any pypdf error after logging it.
        """
        try:
            # Import here to avoid a hard pypdf dependency when unused.
            from pypdf import PdfReader

            logger.debug(f"Processing PDF: {file_path}")
            reader = PdfReader(file_path)
            # BUG FIX: extract_text() can return None (e.g. image-only
            # pages); the original `text += page.extract_text() + "\n"`
            # would then raise TypeError. Also replaces quadratic string
            # += with a single join.
            pages = [(page.extract_text() or "") for page in reader.pages]
            return "\n".join(pages).strip()
        except Exception as e:
            logger.error(f"Error processing PDF {file_path}: {e}")
            raise
class ImageProcessor(DocumentProcessor):
    """Processor for image files; extracts text with Tesseract OCR."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']
        # Multi-language OCR default: English, French, Hindi, Spanish,
        # and simplified Chinese.
        self.default_languages = "eng+fra+hin+spa+chi-sim"

    def process(self, file_path: str, **kwargs) -> str:
        """Extract text from an image file using OCR.

        Args:
            file_path: Path to the image file.

        Keyword Args:
            lang: Tesseract language string; defaults to the broad
                multi-language set configured on the instance.

        Returns:
            The recognized text, stripped of surrounding whitespace.

        Raises:
            Exception: Re-raises any Pillow/pytesseract error after
                logging it.
        """
        try:
            # Import here to keep the OCR dependencies optional.
            import pytesseract
            from PIL import Image

            lang = kwargs.get('lang', self.default_languages)
            logger.debug(f"Processing image: {file_path} with languages: {lang}")
            # BUG FIX: the original never closed the image, leaking a
            # file handle per processed image; `with` releases it.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang=lang)
            return text.strip()
        except Exception as e:
            logger.error(f"Error processing image {file_path}: {e}")
            raise
class DocumentExtractor:
    """Main class for document text extraction.

    Dispatches each file to the first registered processor that supports
    its extension, optionally detects the language of the extracted text
    via the Cohere chat API, and can process many files across worker
    processes.
    """

    def __init__(self):
        """Initialize with the default PDF and image processors."""
        self.processors = [
            PdfProcessor(),
            ImageProcessor()
        ]
        # Created lazily on the first language-detection call.
        self.cohere_client = None

    def add_processor(self, processor: DocumentProcessor) -> None:
        """Add a custom document processor."""
        self.processors.append(processor)

    def get_processor(self, file_path: str) -> Optional[DocumentProcessor]:
        """Return the first processor that can handle *file_path*, else None."""
        for processor in self.processors:
            if processor.can_process(file_path):
                return processor
        return None

    def get_language(self, text: str) -> str:
        """
        Detect the language of the provided text using the Cohere API.

        Args:
            text: Text sample to analyze.

        Returns:
            String containing the detected language name, or "unknown"
            on any API error.
        """
        try:
            # Initialize the client if not already done. (An unused
            # `start = time.time()` was removed here.)
            if not self.cohere_client:
                self.cohere_client = cohere.Client()

            prompt = f"What language is this sentence written in?\n\n{text}\n\nRespond only with the language name."
            response = self.cohere_client.chat(
                model=LLM_MODEL,
                message=prompt,
                max_tokens=100,
                temperature=0.2,
            )
            return response.text
        except Exception as e:
            logger.error(f"Error detecting language: {e}")
            return "unknown"

    def process_file(self, file_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process a single file based on its extension.

        Args:
            file_path: Path to the file.
            **kwargs: Additional processing options forwarded to the
                processor (e.g. ``lang`` for OCR).

        Returns:
            Dictionary with keys ``file_path``, ``filename``, ``text``,
            ``error``, ``type``, ``language`` and ``chunk_size``.
        """
        result = {
            "file_path": file_path,
            "filename": Path(file_path).name,
            "text": "",
            "error": None,
            "type": None,
            "language": None,
            "chunk_size": 0
        }

        try:
            processor = self.get_processor(file_path)

            if processor:
                text = processor.process(file_path, **kwargs)
                result["text"] = text
                # Sample only the first CHUNK_SIZE chars to keep the
                # language-detection API call cheap.
                result["language"] = self.get_language(text[:CHUNK_SIZE]) if text else None
                # e.g. PdfProcessor -> "pdf", ImageProcessor -> "image".
                result["type"] = processor.__class__.__name__.lower().replace('processor', '')
            else:
                ext = Path(file_path).suffix.lower()
                result["error"] = f"Unsupported file type: {ext}"
        except Exception as e:
            result["error"] = str(e)

        return result

    def process_files(self, file_paths: List[str], **kwargs) -> List[Dict[str, Any]]:
        """
        Process multiple files in parallel.

        Args:
            file_paths: List of file paths to process.
            **kwargs: Additional processing options
                (max_workers: max number of processes).

        Returns:
            List of dictionaries with processing results (order follows
            completion, not input order).
        """
        max_workers = kwargs.pop('max_workers', os.cpu_count() or 1)
        logger.info(f"Processing {len(file_paths)} files with {max_workers} workers")

        results = []
        # BUG FIX: the pool was previously created with max_workers * 2,
        # silently doubling the requested (and logged) process count.
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(self.process_file, file_path, **kwargs): file_path
                for file_path in file_paths
            }

            for future in concurrent.futures.as_completed(futures):
                file_path = futures[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    logger.error(f"Exception processing {file_path}: {e}")
                    # BUG FIX: this error entry previously used the keys
                    # "filepath" and "langugae", diverging from the
                    # success-path schema ("file_path"/"language") that
                    # consumers read.
                    results.append({
                        "file_path": file_path,
                        "filename": Path(file_path).name,
                        "text": "",
                        "error": str(e),
                        "type": None,
                        "language": None,
                        "chunk_size": 0
                    })

        return results

    def find_supported_files(self, folder_path: str, recursive: bool = True) -> List[str]:
        """
        Get all supported files in a folder.

        Args:
            folder_path: Path to the folder.
            recursive: Whether to include subfolders.

        Returns:
            List of file paths.
        """
        # Collect all supported extensions as a set for O(1) membership.
        supported_extensions = set()
        for processor in self.processors:
            supported_extensions.update(processor.supported_extensions)

        file_paths = []

        if recursive:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    if Path(file).suffix.lower() in supported_extensions:
                        file_paths.append(os.path.join(root, file))
        else:
            for file in os.listdir(folder_path):
                file_path = os.path.join(folder_path, file)
                if os.path.isfile(file_path) and Path(file).suffix.lower() in supported_extensions:
                    file_paths.append(file_path)

        return file_paths

    def process_folder(self, folder_path: str, recursive: bool = True, **kwargs) -> List[Dict[str, Any]]:
        """
        Process all supported files in a folder.

        Args:
            folder_path: Path to the folder containing documents.
            recursive: Whether to process subfolders recursively.
            **kwargs: Additional processing options.

        Returns:
            List of dictionaries with processing results.
        """
        file_paths = self.find_supported_files(folder_path, recursive)
        logger.info(f"Found {len(file_paths)} supported files in {folder_path}")

        return self.process_files(file_paths, **kwargs)
class FileOutputManager:
    """Class for managing output of extracted text."""

    def __init__(self, output_dir: str = "extracted_texts"):
        """Initialize with the output directory, creating it if needed."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def save_results(self, results: List[Dict[str, Any]]) -> Dict[str, int]:
        """
        Save extracted text to files.

        Each result with non-empty text is written to
        ``<output_dir>/<stem>_<type>.txt``.

        Args:
            results: List of processing results.

        Returns:
            Dictionary with counts of successful, skipped and failed saves.
        """
        stats = {"success": 0, "skipped": 0, "failed": 0}

        for result in results:
            # BUG FIX: use .get() — error-path entries may lack "text".
            if not result.get("text"):
                stats["skipped"] += 1
                continue

            try:
                # Filename: original stem + file type, e.g. "paper_pdf.txt".
                base_name = Path(result['filename']).stem
                file_type = result.get('type', 'unknown')
                output_filename = f"{base_name}_{file_type}.txt"

                output_path = os.path.join(self.output_dir, output_filename)
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(result["text"])
                stats["success"] += 1
            except Exception as e:
                # BUG FIX: some producers emit 'filepath' instead of
                # 'file_path'; tolerate both so logging the failure cannot
                # itself raise KeyError.
                source = result.get("file_path", result.get("filepath", "unknown"))
                logger.error(f"Error saving text from {source}: {e}")
                stats["failed"] += 1

        return stats
| 309 |
+
# Adapter class to convert DocumentExtractor results to langchain Document objects
class DocumentProcessorAdapter:
    """
    Adapter to process documents and convert them to langchain Document objects.
    """
    def __init__(self):
        """Initialize document processor adapter with the extractor."""
        self.extractor = DocumentExtractor()

    def process_folder(self, folder_path):
        """
        Process all documents in a folder.

        Args:
            folder_path (str): Path to the folder containing documents.

        Returns:
            list: extraction result dictionaries, one per document.
            (BUG FIX: the old docstring claimed a tuple was returned,
            but the code returns only the extraction results.)

        Raises:
            FileNotFoundError: If *folder_path* does not exist.
        """
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        # Extract content from documents.
        extraction_results = self.extractor.process_folder(folder_path)
        # CONSISTENCY FIX: use the module logger rather than print, matching
        # the rest of this module.
        logger.info(f"Processed {len(extraction_results)} documents")
        return extraction_results
app/retrieval/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Retrieval package.
|
| 3 |
+
"""
|
app/retrieval/vector_store.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector database operations for document storage and retrieval.
|
| 3 |
+
"""
|
| 4 |
+
from langchain_chroma import Chroma
|
| 5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
+
from langchain_cohere import CohereEmbeddings
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
+
|
| 9 |
+
from ..config.settings import CHUNK_SIZE, CHUNK_OVERLAP, EMBEDDING_MODEL, COHERERANK_MODEL, COHERERANK_TOPN, VECTOSTORE_TOPK
|
| 10 |
+
import cohere
|
| 11 |
+
|
| 12 |
+
class Retriever:
    """
    Wrapper for vector database operations.

    Two-stage retrieval: dense similarity search over chunked documents in
    an in-memory Chroma store, followed by a Cohere rerank pass that
    reorders the candidates for the final answer context.
    """

    def __init__(self, model=EMBEDDING_MODEL):
        # NOTE(review): cohere.Client() presumably reads COHERE_API_KEY from
        # the environment — confirm deployment sets it.
        self.cohere_client = cohere.Client()
        # Populated lazily by create_from_documents(); None until then.
        self.chroma_db = None
        self.embedding_model = CohereEmbeddings(model=model)
        # Chunking parameters come from app.config.settings.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

    def create_from_documents(self, extraction_results):
        """
        Chunk extracted texts and build the Chroma store from them.

        Each entry in ``extraction_results`` must carry 'filename' and
        'text' keys; entries with empty text are skipped. As a side effect,
        each processed entry gains a 'chunk_size' key (number of chunks its
        text was split into). Returns the (mutated) extraction_results.
        """
        chunks = []
        for result in extraction_results:
            filename = result['filename']
            text = result['text']
            if text:
                # Filename metadata enables per-document filtering later.
                document = Document(
                    page_content=text,
                    metadata={"filename": filename}
                )
                doc_chunks = self.text_splitter.split_documents([document])
                result['chunk_size'] = len(doc_chunks)
                chunks.extend(doc_chunks)

        # Rebuilds the store on every call (no incremental add).
        self.chroma_db = Chroma.from_documents(
            chunks,
            embedding=self.embedding_model
        )
        return extraction_results

    def similarity_search(self, query, k=5, filter=None):
        """
        Dense top-k search over the Chroma store.

        Raises:
            ValueError: If create_from_documents() has not been called yet.
        """
        if not self.chroma_db:
            raise ValueError("Vector store has not been initialized with documents")

        return self.chroma_db.similarity_search(query=query, k=k, filter=filter)

    def reranking(self, query, docs, top_n=10):
        """
        Rerank candidate chunks with Cohere and return the top_n texts.

        Note: returns plain page_content strings, not Document objects
        (metadata is dropped at this stage).
        """
        doc_texts = [doc.page_content for doc in docs]
        rerank_response = self.cohere_client.rerank(model=COHERERANK_MODEL, query=query, documents=doc_texts, top_n=top_n)
        # return [docs[result.index] for result in rerank_response.results]
        return [docs[result.index].page_content for result in rerank_response.results]


    def get_relevant_docs(self, chromdb_query, rerank_query, filter, chunk_size):
        """
        Full retrieval pipeline: dense search, then rerank.

        chunk_size caps both stages so a small document never requests more
        chunks than it has. Returns a list of chunk texts (possibly empty).
        """
        dense_topk = min(chunk_size, VECTOSTORE_TOPK)
        reranking_topk = min(chunk_size, COHERERANK_TOPN)
        docs = self.similarity_search(chromdb_query, filter=filter, k=dense_topk)
        if docs:
            return self.reranking(rerank_query, docs, top_n=reranking_topk)
        return []
|
app/summarization/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Summarization package.
|
| 3 |
+
"""
|
app/summarization/output.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Output handling for document summaries in multiple formats.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from ..config.settings import SUMMARIES_OUTPUT_DIR
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SummaryOutputManager:
    """
    Manager for saving and retrieving document summaries in multiple formats.

    Summaries are written to ``output_dir`` as ``<base_name>.md`` and/or
    ``<base_name>.html``, where ``base_name`` is the source filename without
    its extension — the same naming scheme get_available_formats() checks.
    """

    def __init__(self, output_dir=SUMMARIES_OUTPUT_DIR):
        """
        Initialize output manager with output directory.

        Args:
            output_dir (str): Directory to save summaries.
        """
        self.output_dir = output_dir
        self._ensure_output_dir()

    def _ensure_output_dir(self):
        """Create output directory if it doesn't exist."""
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            print(f"Created output directory: {self.output_dir}")

    def save_summary(self, filename, summary, formats=None):
        """
        Save a document summary to files in specified formats.

        Args:
            filename (str): Name of the original document.
            summary (str): Summary text.
            formats (list): Formats to save. Defaults to ['markdown', 'html'].

        Returns:
            dict: Paths to the saved summary files keyed by format.
        """
        if formats is None:
            formats = ['markdown', 'html']

        output_paths = {}

        # Generate and save in each requested format; unknown formats are
        # warned about and skipped rather than raising.
        for fmt in formats:
            if fmt == 'markdown':
                output_paths['markdown'] = self._save_markdown(filename, summary)
            elif fmt == 'html':
                output_paths['html'] = self._save_html(filename, summary)
            else:
                print(f"Warning: Unsupported format '{fmt}' requested")

        return output_paths

    def _save_markdown(self, filename, summary):
        """
        Save a document summary to a markdown file.

        Args:
            filename (str): Name of the original document.
            summary (str): Summary text.

        Returns:
            str: Path to the saved markdown file.
        """
        markdown_content = summary + "\n\n---\n"

        # Name the output after the source document (minus extension) so
        # get_available_formats() can find it again.
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(self.output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

        print(f"Saved markdown summary to: {output_path}")
        return output_path

    def _save_html(self, filename, summary):
        """
        Save a document summary to an HTML file.

        Args:
            filename (str): Name of the original document.
            summary (str): Summary text.

        Returns:
            str: Path to the saved HTML file.
        """
        # Convert summary to HTML paragraphs (blank-line separated).
        # NOTE(review): summary text is embedded without HTML escaping —
        # acceptable only because summaries are generated locally, not
        # untrusted user input.
        paragraphs = summary.split('\n\n')
        html_paragraphs = ''.join([f"<p>{p}</p>" for p in paragraphs if p.strip()])

        # Create HTML output with basic styling
        html_content = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Summary for {filename}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            max-width: 800px;
            margin: 0 auto;
            color: #333;
        }}
        h1 {{
            color: #2c3e50;
            border-bottom: 1px solid #eee;
            padding-bottom: 10px;
        }}
        p {{
            margin-bottom: 16px;
        }}
        .footer {{
            margin-top: 30px;
            padding-top: 10px;
            border-top: 1px solid #eee;
            font-size: 0.9em;
            color: #7f8c8d;
        }}
    </style>
</head>
<body>
    <h1>Summary for {filename}</h1>
    <div class="content">
        {html_paragraphs}
    </div>
    <div class="footer">
        <p>Generated summary</p>
    </div>
</body>
</html>
"""
        # Same naming scheme as _save_markdown, for get_available_formats().
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(self.output_dir, f"{base_name}.html")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Saved HTML summary to: {output_path}")
        return output_path

    def get_available_formats(self, filename):
        """
        Check which formats are available for a given file.

        Args:
            filename (str): Base filename to check.

        Returns:
            list: Available formats for this file ('markdown' and/or 'html').
        """
        available_formats = []
        base_name = os.path.splitext(filename)[0]

        if os.path.exists(os.path.join(self.output_dir, f"{base_name}.md")):
            available_formats.append('markdown')
        if os.path.exists(os.path.join(self.output_dir, f"{base_name}.html")):
            available_formats.append('html')

        return available_formats
|
app/summarization/prompt2.py
ADDED
|
@@ -0,0 +1,627 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Research Paper Summarization: Key Points Extraction for Researchers
|
| 3 |
+
Note: Always return the summary in the same language as the original paper.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# # Mini-Prompt 1: Basic Information
|
| 8 |
+
# basic_info_prompt = """
|
| 9 |
+
# Extract the essential identifying information from this research paper.
|
| 10 |
+
#
|
| 11 |
+
# | Information | Details |
|
| 12 |
+
# |-------------|---------|
|
| 13 |
+
# | Title | [Paper title] |
|
| 14 |
+
# | Authors | [Author names] |
|
| 15 |
+
# | Publication | [Journal/Conference, Year] |
|
| 16 |
+
# | Field | [Research domain] |
|
| 17 |
+
# | Keywords | [Key terms] |
|
| 18 |
+
# | DOI/URL | [Link if available] |
|
| 19 |
+
# """
|
| 20 |
+
#
|
| 21 |
+
# # Mini-Prompt 2: Research Objectives & Abstract
|
| 22 |
+
# objectives_prompt = """
|
| 23 |
+
# Extract the following:
|
| 24 |
+
#
|
| 25 |
+
# 1. Primary research question/objective (2-3 bullet points)
|
| 26 |
+
# 2. Condensed abstract summary (2-3 sentences)
|
| 27 |
+
# 3. Main contributions (3-5 bullet points)
|
| 28 |
+
# 4. Theoretical foundation/frameworks
|
| 29 |
+
#
|
| 30 |
+
# Keep each point short and focused on a single idea.
|
| 31 |
+
# """
|
| 32 |
+
#
|
| 33 |
+
# # Mini-Prompt 3: Methodology Details
|
| 34 |
+
# methodology_prompt = """
|
| 35 |
+
# Extract key methodology details:
|
| 36 |
+
#
|
| 37 |
+
# 1. Base model(s) used
|
| 38 |
+
# 2. Architecture summary
|
| 39 |
+
# 3. Dataset(s) with name, size, and characteristics
|
| 40 |
+
# 4. Experimental setup (conditions, controls, parameters)
|
| 41 |
+
# 5. Implementation details (hardware/software used)
|
| 42 |
+
#
|
| 43 |
+
# Format as a structured table with clear categories.
|
| 44 |
+
# """
|
| 45 |
+
#
|
| 46 |
+
# # Mini-Prompt 4: Key Equations & Technical Approach
|
| 47 |
+
# equations_prompt = """
|
| 48 |
+
# Extract and explain key equations:
|
| 49 |
+
#
|
| 50 |
+
# | Equation | Purpose | Explanation |
|
| 51 |
+
# |----------|---------|-------------|
|
| 52 |
+
# | [Equation 1] | [What it calculates] | [Explanation] |
|
| 53 |
+
# | [Equation 2] | [What it calculates] | [Explanation] |
|
| 54 |
+
#
|
| 55 |
+
# Include 2-3 equations central to the methodology or findings.
|
| 56 |
+
# """
|
| 57 |
+
#
|
| 58 |
+
# # Mini-Prompt 5: Results & Performance
|
| 59 |
+
# results_prompt = """
|
| 60 |
+
# Summarize the performance:
|
| 61 |
+
#
|
| 62 |
+
# 1. 5-7 bullet points for primary findings
|
| 63 |
+
# 2. Performance metrics (accuracy, F1, etc.)
|
| 64 |
+
# 3. Comparison with previous work or baselines
|
| 65 |
+
# 4. Highlights from ablation studies
|
| 66 |
+
#
|
| 67 |
+
# Present performance in a table if applicable:
|
| 68 |
+
#
|
| 69 |
+
# | Metric | Value | Compared to Baseline |
|
| 70 |
+
# |--------|-------|-----------------------|
|
| 71 |
+
# """
|
| 72 |
+
#
|
| 73 |
+
# # Mini-Prompt 6: Critical Analysis
|
| 74 |
+
# analysis_prompt = """
|
| 75 |
+
# Identify strengths and weaknesses:
|
| 76 |
+
#
|
| 77 |
+
# | Strengths | Limitations |
|
| 78 |
+
# |-----------|-------------|
|
| 79 |
+
# | [Point 1] | [Limitation 1] |
|
| 80 |
+
# | [Point 2] | [Limitation 2] |
|
| 81 |
+
# | [Point 3] | [Limitation 3] |
|
| 82 |
+
#
|
| 83 |
+
# Also note:
|
| 84 |
+
# - Key assumptions made
|
| 85 |
+
# - Generalizability of findings
|
| 86 |
+
# - Ethical concerns, if mentioned
|
| 87 |
+
# """
|
| 88 |
+
#
|
| 89 |
+
# # Mini-Prompt 7: Implications & Future Work
|
| 90 |
+
# implications_prompt = """
|
| 91 |
+
# Extract future-oriented content:
|
| 92 |
+
#
|
| 93 |
+
# 1. Research implications (field-level impact)
|
| 94 |
+
# 2. Practical/industry applications
|
| 95 |
+
# 3. Future research directions
|
| 96 |
+
# 4. Unanswered or open questions
|
| 97 |
+
#
|
| 98 |
+
# Use 3-4 concise bullet points.
|
| 99 |
+
# """
|
| 100 |
+
#
|
| 101 |
+
# # Mini-Prompt 8: Executive Summary
|
| 102 |
+
# executive_summary_prompt = """
|
| 103 |
+
# Create a concise executive summary (max 250 words):
|
| 104 |
+
#
|
| 105 |
+
# 1. One-paragraph overview
|
| 106 |
+
# 2. Problem being addressed
|
| 107 |
+
# 3. Approach and methods
|
| 108 |
+
# 4. Key results
|
| 109 |
+
# 5. Significance of the findings
|
| 110 |
+
# """
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# basic_info_prompt = """You are extracting basic information from a research paper. Please provide:
|
| 114 |
+
#
|
| 115 |
+
# 1. Paper title
|
| 116 |
+
# 2. Authors and affiliations
|
| 117 |
+
# 3. Publication date and venue/journal
|
| 118 |
+
# 4. DOI/URL if available
|
| 119 |
+
# 5. Citation information
|
| 120 |
+
# 6. Research domain/field
|
| 121 |
+
#
|
| 122 |
+
# Format as a simple table with categories and details."""
|
| 123 |
+
#
|
| 124 |
+
# # Mini-Prompt 2: Research Objectives and Abstract
|
| 125 |
+
# objectives_prompt = """Based on the research paper, please extract:
|
| 126 |
+
#
|
| 127 |
+
# 1. The primary research question/objective (2-3 bullet points)
|
| 128 |
+
# 2. A condensed abstract summary (2-3 sentences capturing the essence)
|
| 129 |
+
# 3. The main contributions (3-5 key bullet points)
|
| 130 |
+
# 4. The theoretical foundation/frameworks underlying the research
|
| 131 |
+
#
|
| 132 |
+
# Keep each bullet point to 1-2 sentences, focused on a single idea."""
|
| 133 |
+
#
|
| 134 |
+
# # Mini-Prompt 3: Methodology Details
|
| 135 |
+
# methodology_prompt = """Extract the key methodological details from the paper:
|
| 136 |
+
#
|
| 137 |
+
# 1. Base model(s) used (if applicable)
|
| 138 |
+
# 2. System/model architecture (concise description)
|
| 139 |
+
# 3. Datasets used (names, sizes, characteristics)
|
| 140 |
+
# 4. Experimental setup (conditions, controls, parameters)
|
| 141 |
+
# 5. Implementation details (hardware, software, computational resources)
|
| 142 |
+
#
|
| 143 |
+
# Present this information in a structured table format."""
|
| 144 |
+
#
|
| 145 |
+
# # Mini-Prompt 4: Key Equations and Technical Approach
|
| 146 |
+
# equations_prompt = """Identify and explain the most important equations and technical approaches:
|
| 147 |
+
#
|
| 148 |
+
# 1. Extract 2-3 key equations/formulations
|
| 149 |
+
# 2. For each equation, explain:
|
| 150 |
+
# - What it calculates
|
| 151 |
+
# - Its purpose in the paper
|
| 152 |
+
# - How it relates to the overall methodology
|
| 153 |
+
#
|
| 154 |
+
# Format as a table with columns for Equation, Purpose, and Explanation."""
|
| 155 |
+
#
|
| 156 |
+
# # Mini-Prompt 5: Results and Performance
|
| 157 |
+
# results_prompt = """Summarize the main results and performance metrics:
|
| 158 |
+
#
|
| 159 |
+
# 1. Primary findings (5-7 bullet points)
|
| 160 |
+
# 2. Performance metrics (accuracy, F1, BLEU, etc.)
|
| 161 |
+
# 3. Comparison to prior or competing approaches
|
| 162 |
+
# 4. Key insights from ablation studies
|
| 163 |
+
#
|
| 164 |
+
# Present performance metrics in a table with columns for Metric, Value, and Comparison to Previous Work."""
|
| 165 |
+
#
|
| 166 |
+
# # Mini-Prompt 6: Critical Analysis
|
| 167 |
+
# analysis_prompt = """Analyze the strengths and limitations of the paper:
|
| 168 |
+
#
|
| 169 |
+
# 1. Clearly stated limitations (3-4 bullet points)
|
| 170 |
+
# 2. Key assumptions made by the authors
|
| 171 |
+
# 3. Assessment of generalizability of findings
|
| 172 |
+
# 4. Ethical considerations mentioned
|
| 173 |
+
#
|
| 174 |
+
# Present as a comparison table with Strengths and Limitations columns."""
|
| 175 |
+
#
|
| 176 |
+
# # Mini-Prompt 7: Implications and Future Work
|
| 177 |
+
# implications_prompt = """Extract information about implications and future directions:
|
| 178 |
+
#
|
| 179 |
+
# 1. Research implications (how this advances the field)
|
| 180 |
+
# 2. Practical/industry applications of the findings
|
| 181 |
+
# 3. Future research directions identified by the authors
|
| 182 |
+
# 4. Unresolved questions that emerge from this work
|
| 183 |
+
#
|
| 184 |
+
# Provide as 3-4 concise bullet points focusing on significance and future work."""
|
| 185 |
+
#
|
| 186 |
+
# # Mini-Prompt 8: Executive Summary
|
| 187 |
+
# executive_summary_prompt = """Create a concise executive summary of the paper with these components:
|
| 188 |
+
#
|
| 189 |
+
# 1. One-paragraph overview (3-5 sentences)
|
| 190 |
+
# 2. The problem being solved
|
| 191 |
+
# 3. The approach taken
|
| 192 |
+
# 4. The key results
|
| 193 |
+
# 5. Why this matters
|
| 194 |
+
#
|
| 195 |
+
# Keep the entire summary under 250 words for quick reference."""
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# ================= ================= ================= ================= ================= =================
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# --- Active section prompts ----------------------------------------------
# Each constant below is a standalone instruction block sent to the LLM to
# produce one section of the paper summary. The strings are runtime data:
# do not edit their wording casually — the output format (markdown tables /
# bullet templates) is what downstream rendering expects.

# Section 1: paper metadata as a two-column markdown table.
basic_info_prompt = """# Basic Paper Information

Generate a concise summary of the paper's essential metadata using the table below. Ensure all details are accurately extracted and easy for researchers to scan. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

| Information | Details |
|-------------------|----------------------------------|
| **Title** | [Full title of the paper] |
| **Authors** | [Complete list of authors] |
| **Publication Venue** | [Journal/Conference, Year] |
| **Research Field**| [Primary domain or discipline] |
| **Keywords** | [Relevant terms and topics - use bullet points if multiple] |
"""

# Section 2: research question / problem / contribution / significance.
research_focus_prompt = """# Core Research Focus

Summarize the central aim, problem, contribution, and significance of the paper. Present the information clearly and concisely using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

* **Research Question:** [What is being investigated? State this clearly.]
* **Problem Statement:** [What specific gap or issue does the paper address? Be direct.]
* **Main Contribution:** [What is the core offering, innovation, or finding? Highlight the novelty.]
* **Significance:** [Why is this research important for the field or practice? Briefly explain the impact.]
"""

# Section 3: abstract decomposed into its standard components.
abstract_prompt = """# Abstract Summary

Break down the paper's abstract into its fundamental components for quick comprehension. Present the information concisely using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

* **Background:** [Brief context leading to the study]
* **Problem:** [The specific issue the paper tackles]
* **Methodology:** [Approach, methods, or techniques used]
* **Key Findings:** [Main results or discoveries - use sub-bullets if needed]
* **Conclusion:** [Primary takeaway or implication]
"""

# Section 4: study design, datasets, models, evaluation, tooling.
methods_prompt = """# Methodology Summary

Describe how the research was conducted, focusing on key aspects like study design, data, techniques, and evaluation. Present the information concisely using bullet points, with sub-bullets for details. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

* **Study Design:** [e.g., Experimental, Simulation, Case Study, etc.]
* **Dataset(s):**
    * Source: [Where the data came from]
    * Size: [Amount of data]
    * Key Characteristics: [Important features or properties]
    * Preprocessing: [Main steps taken to prepare data]
* **Techniques/Models:** [Specific models, algorithms, or frameworks used - list key ones]
* **Evaluation:**
    * Metrics: [How performance/success was measured - list key metrics]
    * Setup: [Briefly describe evaluation setup if notable]
* **Tools & Software:** [Libraries, platforms, hardware specifics if critical]
"""

# Section 5: main findings table plus comparison to prior work.
results_prompt = """# Key Results

List and explain the paper's main outcomes and their importance. Use the table for primary findings and bullet points for comparisons to prior work. Keep descriptions and insights brief and impactful. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

| Finding # | Description of Result | Significance / Insight |
|-----------|-------------------------------|----------------------------------|
| 1 | [What was observed/found?] | [Why this result is important or novel?] |
| 2 | [Another key result] | [Its implication or contribution] |
| 3 | [Third main finding] | [What we learn from this] |
| ... | [Add more rows as needed] | [Corresponding insight] |

**Comparison to Prior Work:**
* [Highlight how these results differ from or improve upon previous research.]
* [Mention specific previous work if comparison is direct.]
* [Explain why the improvement or difference matters.]
"""

# Section 6: key figures and tables with their interpretation.
visuals_prompt = """# Important Figures & Tables

Highlight the most critical visualizations and tabular data from the paper. Explain their content and why they are important for understanding the research. Use the table below.

| Visual Element | Brief Description | Key Insight or Interpretation |
|-----------------|-------------------------------|--------------------------------------|
| **Figure [Number]**| [What the figure depicts or shows] | [What key point or data trend does it illustrate?] |
| **Table [Number]** | [Summary of data/content in the table] | [What conclusion or comparison can be drawn from this table?] |
| **Figure [Number]**| [Another key visualization] | [Why is this figure crucial for the results or argument?] |
| ... | [Add more rows as needed] | [Corresponding insight] |
"""

# Section 7: limitations by category plus suggested future directions.
limitations_prompt = """# Limitations & Future Work

Detail the limitations encountered during the research and outline suggested future directions. Use bullet points for both limitations and future work. Be concise. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

**Limitations:**
* **Theoretical:** [Conceptual limits of the approach, with brief impact]
* **Methodological:** [Issues with design or procedure, with brief impact]
* **Data-Related:** [Constraints due to data quality/availability, with brief impact]
* [Add other relevant limitations]

**Future Work Suggestions:**
* [Proposed next steps or improvements to the current work.]
* [New areas or questions for future research based on these findings.]
* [Potential experiments or applications to explore.]
"""

# Section 8: contributions categorized by type, with a headline pick.
contributions_prompt = """# Main Contributions

List all major contributions of the paper, categorized by type. Explain how each contribution adds value or novelty to the field. Use bullet points, with sub-bullets for novelty/advancement. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

* **Theoretical:** [New framework, concept, or insight introduced]
    * Novelty/Advancement: [How it extends or changes existing theory]
* **Methodological:** [New method, algorithm, or model developed]
    * Novelty/Advancement: [What makes it different, better, or more efficient?]
* **Empirical:** [Significant findings or results from experiments/data]
    * Novelty/Advancement: [Why these results matter or what they demonstrate?]
* **Practical:** [Applications, systems, or tools developed]
    * Novelty/Advancement: [Real-world relevance or utility]
* [Add other relevant contributions]

**Most Noteworthy Contribution:** [Briefly summarize the single biggest impact or most innovative aspect of the paper.]
"""

# Section 9: positioning against prior work and the gap addressed.
related_work_prompt = """# Related Work

Show how this research fits into the existing landscape of studies and what specific gaps it addresses. Use the table to compare this work to previous approaches and list the addressed gap using bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

| Topic/Area | Previous Approaches | This Paper's Innovation / Difference |
|-----------------|----------------------------------|----------------------------------------|
| **[Relevant Area 1]**| [Summary of how prior work handled this] | [What new approach, technique, or finding is introduced here?] |
| **[Relevant Area 2]**| [Other related methods or studies] | [How does this paper build upon or deviate from them?] |
| **[Relevant Area 3]**| [Existing theories or models] | [Enhancements, alternatives, or validations provided by this work] |
| ... | [Add more rows as needed] | [Corresponding innovation] |

**Gap Addressed:**
* [What specific problem, limitation, or missing piece in the existing literature does this paper tackle?]
"""

# Section 10: real-world applications with requirements and feasibility.
applications_prompt = """# Practical Applications

Explore potential real-world applications of the research findings or methods. Use the table to detail potential use cases, required conditions, and feasibility. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.

| Domain/Industry | Potential Use Case or Application | Key Requirements or Dependencies | Feasibility/Timeline (e.g., Short/Med/Long term) |
|-----------------|--------------------------------------|-------------------------------------|-------------------|
| **[Domain 1]** | [How can the results/methods be used here?] | [What data, technology, or infrastructure is needed?] | [Estimated time to potential deployment] |
| **[Domain 2]** | [Another potential application area] | [Factors affecting feasibility or adoption] | [Estimated time to potential deployment] |
| **[Domain 3]** | [Innovative potential application] | [Challenges or conditions for implementation] | [Estimated time to potential deployment] |
| ... | [Add more rows as needed] | [Corresponding requirements] | [Corresponding timeline] |

**Most Promising Use Case:** [Briefly highlight the application with the highest potential impact or feasibility.]
"""
|
| 342 |
+
|
| 343 |
+
technical_prompt = """# Technical Details
|
| 344 |
+
|
| 345 |
+
Provide a concise summary of the paper's specific technical aspects. Use the table for algorithms, architecture, implementation, and performance. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
|
| 346 |
+
|
| 347 |
+
| Component | Description | Key Configuration or Parameters |
|
| 348 |
+
|-------------------|-------------------------------------|--------------------------------------|
|
| 349 |
+
| **Algorithm(s)** | [What specific algorithm(s) are central?] | [Key hyperparameters, variations used, etc.] |
|
| 350 |
+
| **Model/Architecture**| [Type or design of the model/system] | [Number of layers, components, specific structure details] |
|
| 351 |
+
| **Implementation**| [Languages, key libraries, environment specifics] | [Frameworks used (TensorFlow, PyTorch, etc.), notable dependencies] |
|
| 352 |
+
| **Performance** | [Key performance metrics reported] | [Results achieved (e.g., Accuracy %, F1 score, latency ms)] |
|
| 353 |
+
| ... | [Add more rows as needed] | [Corresponding details] |
|
| 354 |
+
|
| 355 |
+
"""
|
| 356 |
+
|
| 357 |
+
quick_summary_prompt = """# Quick Summary
|
| 358 |
+
|
| 359 |
+
Provide a highly concise summary of the entire paper, suitable for a quick grasp of its core message. Include both a brief paragraph and a single-sentence version. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
|
| 360 |
+
|
| 361 |
+
**Brief Summary (3–5 Sentences):**
|
| 362 |
+
[Write a concise summary covering the paper's motivation, core method, main findings, and overall significance.]
|
| 363 |
+
|
| 364 |
+
**One-Sentence Summary:**
|
| 365 |
+
[Write a single, impactful sentence that captures the paper’s most important contribution or finding.]
|
| 366 |
+
"""
|
| 367 |
+
|
| 368 |
+
reading_guide_prompt = """# Reading Guide
|
| 369 |
+
|
| 370 |
+
Help researchers quickly navigate the paper by highlighting the most important sections and the key information found within them. Suggest an efficient reading path. Use the table for key sections and bullet points for the reading path. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
|
| 371 |
+
|
| 372 |
+
| Section Name | Key Information or Reason to Focus Here |
|
| 373 |
+
|-------------------|------------------------------------|
|
| 374 |
+
| **[Section Name 1]**| [What is the main idea or critical takeaway from this section?] |
|
| 375 |
+
| **[Section Name 2]**| [Why is this section particularly insightful or important for understanding the work?] |
|
| 376 |
+
| **[Section Name 3]**| [What key details or results are presented here?] |
|
| 377 |
+
| ... | [Add more rows as needed] |
|
| 378 |
+
| **[Conclusion Section]**| [Main takeaways and future implications.] |
|
| 379 |
+
|
| 380 |
+
**Recommended Reading Path:**
|
| 381 |
+
* [Suggest an efficient order to read the key sections for maximum understanding (e.g., Abstract -> Introduction -> Methods (key parts) -> Results (key figures/tables) -> Conclusion).]
|
| 382 |
+
"""
|
| 383 |
+
|
| 384 |
+
equations_prompt = """# Key Equations
|
| 385 |
+
|
| 386 |
+
Highlight and explain the major equations presented in the paper. For each equation, describe its purpose, define its variables, and explain its significance to the research. Use the table below. Use LaTeX format ($$...$$ for block, $...$ for inline) for equations. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
|
| 387 |
+
|
| 388 |
+
| Equation | Purpose or Role in the Paper | Why It Matters to the Research |
|
| 389 |
+
|-------------------|-----------------------------------|------------------------------------|
|
| 390 |
+
| $$ [Equation 1] $$ | [What the equation calculates, models, or represents] | [Its role in the method, results, or theory] |
|
| 391 |
+
| $$ [Equation 2] $$ | [Purpose of this equation] | [Its impact on the conclusions or findings] |
|
| 392 |
+
| $$ [Equation 3] $$ | [Purpose of this equation] | [How it supports the overall argument] |
|
| 393 |
+
| ... | [Add more rows as needed] | [Corresponding significance] |
|
| 394 |
+
"""
|
| 395 |
+
|
| 396 |
+
executive_summary_prompt = """# Executive Summary
|
| 397 |
+
|
| 398 |
+
Provide a high-level summary of the paper, tailored for research leads, grant reviewers, or collaborators. Focus on the problem, solution, key results, and implications using concise bullet points. Provide the output in the same language as this prompt. If a specific detail cannot be found, provide an empty string (`""`) for that item or cell, do not use placeholder text.
|
| 399 |
+
|
| 400 |
+
* **Research Problem:** [Clear articulation of the challenge the paper addresses]
|
| 401 |
+
* **Proposed Solution:** [Brief overview of the method, model, or approach introduced]
|
| 402 |
+
* **Major Results:** [Highlights of the most significant findings or achievements - use sub-bullets if needed]
|
| 403 |
+
* **Implications:** [Practical, theoretical, or future impact of the work]
|
| 404 |
+
* **Relevance:** [Why this paper is important and should be paid attention to]
|
| 405 |
+
"""
|
| 406 |
+
|
| 407 |
+
# ================= ================= ================= ================= ================= =================
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
# 1. Basic Paper Information
|
| 413 |
+
# basic_info_prompt = """# Basic Paper Information
|
| 414 |
+
#
|
| 415 |
+
# Extract all essential metadata to clearly identify and classify the research paper. Focus on accurately capturing the publication details.
|
| 416 |
+
#
|
| 417 |
+
# | Information | Details |
|
| 418 |
+
# |-------------------|----------------------------------|
|
| 419 |
+
# | Title | [Full title of the paper] |
|
| 420 |
+
# | Authors | [Complete list of authors] |
|
| 421 |
+
# | Publication Venue | [Journal/Conference, Year] |
|
| 422 |
+
# | Research Field | [Primary domain or discipline] |
|
| 423 |
+
# | Keywords | [Relevant terms and topics] |
|
| 424 |
+
# """
|
| 425 |
+
#
|
| 426 |
+
# # 2. Core Research Focus
|
| 427 |
+
# research_focus_prompt = """# Core Research Focus
|
| 428 |
+
#
|
| 429 |
+
# Summarize the central aim of the paper. Clearly articulate the main question, the addressed problem, and the novelty of the contribution.
|
| 430 |
+
#
|
| 431 |
+
# | Element | Details |
|
| 432 |
+
# |----------------------|-------------------------------------------------|
|
| 433 |
+
# | Research Question | [What is being investigated?] |
|
| 434 |
+
# | Problem Statement | [What gap or issue does the paper address?] |
|
| 435 |
+
# | Main Contribution | [What is the core offering or innovation?] |
|
| 436 |
+
# | Significance | [Why is this research important?] |
|
| 437 |
+
# """
|
| 438 |
+
#
|
| 439 |
+
# # 3. Abstract Summary
|
| 440 |
+
# abstract_prompt = """# Abstract Summary
|
| 441 |
+
#
|
| 442 |
+
# Break down the abstract into its fundamental components for easier comprehension.
|
| 443 |
+
#
|
| 444 |
+
# | Component | Details |
|
| 445 |
+
# |---------------|----------------------------------------|
|
| 446 |
+
# | Background | [Brief context of the study] |
|
| 447 |
+
# | Research Problem | [Specific issue the paper solves] |
|
| 448 |
+
# | Methodology | [Approach or technique used] |
|
| 449 |
+
# | Key Findings | [Main results or discoveries] |
|
| 450 |
+
# | Conclusion | [Primary takeaway from the study] |
|
| 451 |
+
# """
|
| 452 |
+
#
|
| 453 |
+
#
|
| 454 |
+
# # 4. Methodology Summary
|
| 455 |
+
# methods_prompt = """# Methodology Summary
|
| 456 |
+
#
|
| 457 |
+
# Describe how the research was conducted, including data, tools, and procedures.
|
| 458 |
+
#
|
| 459 |
+
# | Component | Details |
|
| 460 |
+
# |------------------|-------------------------------------------|
|
| 461 |
+
# | Study Design | [Experimental, simulation, case study etc.] |
|
| 462 |
+
# | Dataset | [Source, size, preprocessing, etc.] |
|
| 463 |
+
# | Techniques Used | [Models, algorithms, or frameworks used] |
|
| 464 |
+
# | Evaluation Metrics| [How success or performance was measured]|
|
| 465 |
+
# | Tools & Software | [Libraries, platforms, hardware specifics]|
|
| 466 |
+
# """
|
| 467 |
+
#
|
| 468 |
+
#
|
| 469 |
+
# # 5. Key Results
|
| 470 |
+
# results_prompt = """# Key Results
|
| 471 |
+
#
|
| 472 |
+
# List and explain the main outcomes, their impact, and how they compare to past work.
|
| 473 |
+
#
|
| 474 |
+
# | Finding # | Description of Result | Significance / Insight |
|
| 475 |
+
# |-----------|-------------------------------|----------------------------------|
|
| 476 |
+
# | 1 | [What was observed] | [Why it matters] |
|
| 477 |
+
# | 2 | [What was observed] | [Why it matters] |
|
| 478 |
+
# | 3 | [What was observed] | [Why it matters] |
|
| 479 |
+
#
|
| 480 |
+
# **Comparison to Prior Work:** [Highlight how these results differ or improve upon previous research.]
|
| 481 |
+
# """
|
| 482 |
+
#
|
| 483 |
+
#
|
| 484 |
+
# # 6. Important Figures & Tables
|
| 485 |
+
# visuals_prompt = """# Important Figures & Tables
|
| 486 |
+
#
|
| 487 |
+
# Highlight the most critical visualizations and tabular data, explaining their importance.
|
| 488 |
+
#
|
| 489 |
+
# | Figure/Table | Description | Insight or Interpretation |
|
| 490 |
+
# |--------------|----------------------------------|--------------------------------------|
|
| 491 |
+
# | Figure 1 | [What it shows] | [Why it's important] |
|
| 492 |
+
# | Table 2 | [Data/content summary] | [What we learn from it] |
|
| 493 |
+
# | Figure 3 | [Trend or structure depicted] | [Significance to conclusions] |
|
| 494 |
+
# """
|
| 495 |
+
#
|
| 496 |
+
#
|
| 497 |
+
# # 7. Limitations & Future Work
|
| 498 |
+
# limitations_prompt = """# Limitations & Future Work
|
| 499 |
+
#
|
| 500 |
+
# Detail the limitations encountered in the research and outline proposed future directions.
|
| 501 |
+
#
|
| 502 |
+
# | Type | Limitation Description | Potential Impact |
|
| 503 |
+
# |----------------|----------------------------------|--------------------------------------|
|
| 504 |
+
# | Theoretical | [Conceptual limits] | [Effect on validity/generalizability]|
|
| 505 |
+
# | Methodological | [Design or procedure issues] | [Effect on robustness] |
|
| 506 |
+
# | Data-Related | [Data quality, availability] | [Effect on conclusions] |
|
| 507 |
+
#
|
| 508 |
+
# **Future Work Suggestions:**
|
| 509 |
+
# - [Proposed improvement or next step]
|
| 510 |
+
# - [New areas to explore]
|
| 511 |
+
# - [Potential experiments or applications]
|
| 512 |
+
# """
|
| 513 |
+
#
|
| 514 |
+
#
|
| 515 |
+
# # 8. Main Contributions
|
| 516 |
+
# contributions_prompt = """# Main Contributions
|
| 517 |
+
#
|
| 518 |
+
# List all major contributions by type, and explain how each adds value.
|
| 519 |
+
#
|
| 520 |
+
# | Category | Contribution Summary | Novelty or Advancement |
|
| 521 |
+
# |----------------|----------------------------------|----------------------------------------|
|
| 522 |
+
# | Theoretical | [New framework or insight] | [How it extends theory] |
|
| 523 |
+
# | Methodological | [New method/model] | [What makes it different or better] |
|
| 524 |
+
# | Empirical | [Results from data/experiments] | [Why they matter] |
|
| 525 |
+
# | Practical | [Applications or systems] | [Real-world relevance] |
|
| 526 |
+
#
|
| 527 |
+
# **Most Noteworthy Contribution:** [Summarize the biggest impact of the paper]
|
| 528 |
+
# """
|
| 529 |
+
#
|
| 530 |
+
# # 9. Related Work
|
| 531 |
+
# related_work_prompt = """# Related Work
|
| 532 |
+
#
|
| 533 |
+
# Show how this research fits into the existing landscape and what gaps it fills.
|
| 534 |
+
#
|
| 535 |
+
# | Topic/Area | Previous Approaches | This Paper's Innovation |
|
| 536 |
+
# |------------------|----------------------------------|----------------------------------------|
|
| 537 |
+
# | Area 1 | [Summary of prior methods] | [What’s new in this work] |
|
| 538 |
+
# | Area 2 | [Prior attempts or models] | [Improvements or alternatives] |
|
| 539 |
+
# | Area 3 | [Old techniques or theories] | [Enhancements introduced here] |
|
| 540 |
+
#
|
| 541 |
+
# **Gap Addressed:** [What missing element or inefficiency this paper tackles]
|
| 542 |
+
# """
|
| 543 |
+
#
|
| 544 |
+
# # 10. Practical Applications
|
| 545 |
+
# applications_prompt = """# Practical Applications
|
| 546 |
+
#
|
| 547 |
+
# Explore how the research can be applied in real-world domains.
|
| 548 |
+
#
|
| 549 |
+
# | Domain/Industry | Use Case or Application | Requirements or Dependencies | Expected Timeline |
|
| 550 |
+
# |------------------|----------------------------------|-------------------------------------|-------------------|
|
| 551 |
+
# | Domain 1 | [What can be done] | [Tech, data, adoption needs] | [Short/Med/Long] |
|
| 552 |
+
# | Domain 2 | [Another use case] | [Feasibility factors] | [Short/Med/Long] |
|
| 553 |
+
# | Domain 3 | [Innovative potential] | [Deployment conditions] | [Short/Med/Long] |
|
| 554 |
+
#
|
| 555 |
+
# **Most Promising Use Case:** [Brief highlight of top application potential]
|
| 556 |
+
# """
|
| 557 |
+
#
|
| 558 |
+
#
|
| 559 |
+
# # 11. Technical Details
|
| 560 |
+
# technical_prompt = """# Technical Details
|
| 561 |
+
#
|
| 562 |
+
# Dive into the specific technical aspects, including algorithms, architecture, and implementation details.
|
| 563 |
+
#
|
| 564 |
+
# | Component | Description | Configuration or Parameters |
|
| 565 |
+
# |---------------|-------------------------------------|--------------------------------------|
|
| 566 |
+
# | Algorithm | [What algorithm is used] | [Hyperparameters, version etc.] |
|
| 567 |
+
# | Model/Architecture | [Type or design used] | [Layers, connections, components] |
|
| 568 |
+
# | Implementation| [Languages, packages, environment] | [Frameworks, hardware specifics] |
|
| 569 |
+
# | Performance | [Observed performance] | [Accuracy, latency, etc.] |
|
| 570 |
+
#
|
| 571 |
+
# **Code Repository:** [Link if available or mention if not provided]
|
| 572 |
+
# """
|
| 573 |
+
#
|
| 574 |
+
#
|
| 575 |
+
# # 12. Quick Summary
|
| 576 |
+
# quick_summary_prompt = """# Quick Summary
|
| 577 |
+
#
|
| 578 |
+
# Provide an overview of the entire paper in both concise and single-sentence formats.
|
| 579 |
+
#
|
| 580 |
+
# **Brief Summary (3–5 Sentences):**
|
| 581 |
+
# [Include motivation, methodology, findings, and significance.]
|
| 582 |
+
#
|
| 583 |
+
# **One-Sentence Summary:**
|
| 584 |
+
# [A compact summary capturing the paper’s core message.]
|
| 585 |
+
# """
|
| 586 |
+
#
|
| 587 |
+
#
|
| 588 |
+
# # 13. Reading Guide
|
| 589 |
+
# reading_guide_prompt = """# Reading Guide
|
| 590 |
+
#
|
| 591 |
+
# Help readers focus on the most insightful sections.
|
| 592 |
+
#
|
| 593 |
+
# | Section Name | Key Information or Reason to Read |
|
| 594 |
+
# |-------------------|------------------------------------|
|
| 595 |
+
# | [Section A] | [Main idea or takeaway] |
|
| 596 |
+
# | [Section B] | [Core implementation detail] |
|
| 597 |
+
# | [Section C] | [Critical results or discussion] |
|
| 598 |
+
#
|
| 599 |
+
# **Recommended Reading Path:** [Suggestion for efficient reading – e.g., skip intro, read methods, then results]
|
| 600 |
+
# """
|
| 601 |
+
#
|
| 602 |
+
#
|
| 603 |
+
# # 14. Key Equations
|
| 604 |
+
# equations_prompt = """# Key Equations
|
| 605 |
+
#
|
| 606 |
+
# Highlight and explain major equations in the paper.
|
| 607 |
+
#
|
| 608 |
+
# | Equation | Purpose | Variable Explanation | Why It Matters |
|
| 609 |
+
# |------------------|-----------------------------------|---------------------------------------|------------------------------------|
|
| 610 |
+
# | [Equation 1] | [What it calculates or models] | [Define each term] | [Its role in the paper] |
|
| 611 |
+
# | [Equation 2] | [Purpose] | [Define each term] | [Impact on method/results] |
|
| 612 |
+
# | [Equation 3] | [Purpose] | [Define each term] | [How it supports the conclusions] |
|
| 613 |
+
# """
|
| 614 |
+
#
|
| 615 |
+
# # 15. Executive Summary
|
| 616 |
+
# executive_summary_prompt = """# Executive Summary
|
| 617 |
+
#
|
| 618 |
+
# Offer a high-level summary tailored for research leads, grant reviewers, or collaborators.
|
| 619 |
+
#
|
| 620 |
+
# | Section | Description |
|
| 621 |
+
# |----------------|--------------------------------------|
|
| 622 |
+
# | Research Problem | [Clear articulation of the challenge] |
|
| 623 |
+
# | Proposed Solution| [Brief on method/model introduced] |
|
| 624 |
+
# | Major Results | [Highlights of key findings] |
|
| 625 |
+
# | Implications | [Practical, theoretical impact] |
|
| 626 |
+
# | Relevance | [Why this paper should be read] |
|
| 627 |
+
# """
|
app/summarization/summarizer.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
import cohere
|
| 4 |
+
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
| 5 |
+
from ..config.settings import LLM_MODEL
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DocumentSummarizer:
    """Generates a structured, multi-section Markdown summary of a document.

    For each summary component (basic info, abstract, methods, ...) the class
    retrieves the most relevant chunks from the vector-store retriever, then
    asks the Cohere Chat API to summarize them with a component-specific
    prompt. Components are processed in parallel (API calls are I/O bound)
    and the per-component summaries are stitched into one report.
    """

    def __init__(self, retriever, batch_size=4):
        """
        Args:
            retriever: Object exposing ``get_relevant_docs(...)`` used to
                fetch document chunks for a component query.
            batch_size: Retained for API compatibility; not used by the
                current parallel processing path.
        """
        self.batch_size = batch_size
        self.retriever = retriever  # Store the retriever here

        # NOTE(review): ClientV2() presumably reads COHERE_API_KEY from the
        # environment — confirm deployment sets it.
        self.cohere_client = cohere.ClientV2()

        # Maps internal component key -> human-readable section title, used
        # both in retrieval queries and as headings in the compiled summary.
        self.components = {
            'basic_info': "Basic Paper Information",
            'abstract': "Abstract Summary",
            'methods': "Methodology Summary",
            'results': "Key Results",
            'limitations': "Limitations & Future Work",
            'related_work': "Related Work",
            'applications': "Practical Applications",
            'technical': "Technical Details",
            'equations': "Key Equations",
        }

        self.prompts = self._initialize_prompts()

        # Validate prompts dictionary matches components; catches a missing
        # prompt or an orphaned prompt early, at construction time.
        missing_prompts = [comp for comp in self.components if comp not in self.prompts]
        if missing_prompts:
            print(f"Warning: No prompts found for components: {missing_prompts}")
        missing_components = [prompt_key for prompt_key in self.prompts if prompt_key not in self.components]
        if missing_components:
            print(f"Warning: Prompts found for components not in self.components: {missing_components}")

    def _initialize_prompts(self):
        """Return the component-key -> prompt-text mapping.

        The prompt module is imported lazily so it is only loaded once a
        summarizer is actually constructed.
        """
        from ..summarization.prompt2 import (
            basic_info_prompt, abstract_prompt,
            methods_prompt, results_prompt, limitations_prompt,
            related_work_prompt, applications_prompt,
            technical_prompt, equations_prompt,
        )
        return {
            'basic_info': basic_info_prompt,
            'abstract': abstract_prompt,
            'methods': methods_prompt,
            'results': results_prompt,
            'limitations': limitations_prompt,
            'related_work': related_work_prompt,
            'applications': applications_prompt,
            'technical': technical_prompt,
            'equations': equations_prompt,
        }

    def summarize_text(self, documents: List[Dict], prompt: str, language: str):
        """Summarize *documents* with *prompt* via the Cohere Chat API.

        Args:
            documents: Chunk dicts passed straight through to the API's
                ``documents`` parameter.
            prompt: Component-specific instruction prompt.
            language: Language the model is asked to respond in.

        Returns:
            The generated summary text, or ``None`` when there is nothing to
            summarize, the API call fails, or the response has an
            unexpected shape.
        """
        if not documents:
            print("Warning: No documents provided for summarization.")
            return None

        try:
            response = self.cohere_client.chat(
                model=LLM_MODEL,
                documents=documents,  # Pass the list of dicts directly
                messages=[
                    {"role": "system", "content": f"You are an expert summarization AI. Please respond in {language}."},
                    {"role": "user", "content": f"{prompt}"},
                ],
            )
            # Defensively unwrap the nested response structure before use.
            if response and response.message and response.message.content and response.message.content[0] and response.message.content[0].text:
                return response.message.content[0].text
            print(f"Warning: Unexpected API response structure for prompt: {prompt[:50]}...")
            return None
        except Exception as e:
            print(f"Error during Cohere API call: {e}")
            return None

    def extract_relevant_documents(self, component: str, filename: str, chunk_size: int):
        """Fetch the chunks of *filename* most relevant to *component*.

        Returns an empty list when retrieval fails so callers can treat
        "no documents" and "retrieval error" uniformly.
        """
        # BUG FIX: the query previously embedded the literal "(unknown)"
        # instead of interpolating the actual filename.
        query = f"Analyze the {self.components.get(component, component)} section from the document titled '{filename}'."
        try:
            documents = self.retriever.get_relevant_docs(
                chromdb_query=query,
                rerank_query=query,
                filter={'filename': filename},
                chunk_size=chunk_size,
            )
            return documents
        except Exception as e:
            print(f"Error during document retrieval for component {component}: {e}")
            return []

    def summerize_document(self, filename: str, language: str, chunk_size: int):
        """Summarize *filename* by processing every component in parallel.

        Args:
            filename: Document identifier used to filter retrieval.
            language: Target language for the generated summaries.
            chunk_size: Chunk-size hint forwarded to the retriever.

        Returns:
            The compiled Markdown summary (possibly missing sections for
            components that failed or returned no data).
        """
        start_total = time.time()
        components = list(self.components.keys())
        results = {}
        errors = {}  # Track per-component failures

        def process_component(comp):
            """Retrieve and summarize one component; returns (comp, summary, error)."""
            comp_start = time.time()
            print(f"Starting processing for component: {comp}")
            try:
                document_chunks = self.extract_relevant_documents(comp, filename, chunk_size)
                if not document_chunks:
                    # BUG FIX: message previously printed "(unknown)" instead
                    # of the filename being processed.
                    print(f"No documents found for component: {comp} for file {filename}")
                    return comp, None, "No documents found"

                prompt = self.prompts.get(comp)
                if not prompt:
                    print(f"No prompt defined for component: {comp}")
                    return comp, None, "No prompt defined"

                summary = self.summarize_text(document_chunks, prompt, language)

                comp_end = time.time()
                print(f"Finished processing for component: {comp}. Time taken: {comp_end - comp_start:.2f} seconds")
                return comp, summary, None

            except Exception as e:
                comp_end = time.time()
                print(f"Error processing component {comp}: {e}. Time taken: {comp_end - comp_start:.2f} seconds")
                return comp, None, str(e)

        # Threads (not processes) because the work is I/O-bound API calls;
        # max_workers=None picks a sensible system default.
        with ThreadPoolExecutor(max_workers=None) as executor:
            future_to_component = {executor.submit(process_component, comp): comp for comp in components}

            for future in as_completed(future_to_component):
                comp = future_to_component[future]
                try:
                    comp_name, result, error = future.result()
                    if result is not None:
                        results[comp_name] = result
                    elif error:
                        errors[comp_name] = error
                except Exception as exc:
                    # Exceptions raised while retrieving the future's result
                    # (rare — process_component already catches its own).
                    print(f'{comp} generated an exception: {exc}')
                    errors[comp] = str(exc)

        end_total = time.time()
        print(f"\n--- Total summarization time for {filename}: {end_total - start_total:.2f} seconds ---\n")

        compiled = self.compile_summary(filename, results)
        if errors:
            print(f"Components that failed or returned no data: {list(errors.keys())}")

        return compiled

    # Correctly spelled alias; keeps the original (misspelled) public name
    # working for existing callers while offering the conventional spelling.
    summarize_document = summerize_document

    def compile_summary(self, filename: str, results: Dict[str, str]) -> str:
        """Concatenate per-component results into one Markdown document.

        Sections appear in a fixed, predefined order; components with no
        result are silently skipped.
        """
        sections_order = [
            'basic_info', 'abstract',
            'methods', 'results', 'equations', 'technical',
            'related_work', 'applications', 'limitations',
        ]

        # BUG FIX: header previously rendered the literal "(unknown)"
        # instead of the filename.
        lines = [f"# Summary of {filename}", f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"]
        for section in sections_order:
            # Only add a section if it was processed and returned a result.
            if section in results and results[section]:
                # Fall back to the raw key if a result somehow has no title.
                title = self.components.get(section, section).title()
                lines.append(f"## {title}\n")
                lines.append(f"{results[section]}\n")

        return "\n".join(lines)
|
app/utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility package.
|
| 3 |
+
"""
|
app/utils/enviornments.py
ADDED
|
File without changes
|
app/utils/progress_tracker.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Progress monitoring utilities for tracking parallel processing.
|
| 3 |
+
"""
|
| 4 |
+
import time
|
| 5 |
+
import threading
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Dict, List, Any, Set
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ProgressTracker:
    """
    Thread-safe tracker for the progress of parallel document processing.

    Counts started/completed/failed documents, estimates remaining time from
    the average duration of completed documents, and can run a background
    thread that logs a progress line every ``update_interval`` seconds.
    """

    def __init__(self, total_documents: int, update_interval: float = 5):
        """
        Initialize the progress tracker.

        Args:
            total_documents: Total number of documents to process.
            update_interval: How often the monitor thread logs updates (seconds).
        """
        self.total = total_documents
        self.completed = 0
        self.failed = 0
        self.in_progress = 0
        # Filenames already counted, so a duplicate mark_completed() call for
        # the same file cannot inflate the completed/failed totals.
        self.processed_files: set = set()
        self.update_interval = update_interval
        self.lock = threading.Lock()
        self.start_time = time.time()
        self.monitor_thread = None
        self.stop_monitoring = threading.Event()
        # Per-instance handle to the module logger keeps the class usable
        # standalone (same underlying logger object as before).
        self._logger = logging.getLogger(__name__)

    def mark_started(self, filename: str) -> None:
        """Mark a document as being processed."""
        with self.lock:
            self.in_progress += 1
            # Bug fix: the filename was previously missing from this message
            # (it logged the literal text "(unknown)").
            self._logger.info(f"Started processing: {filename}")

    def mark_completed(self, filename: str, success: bool = True) -> None:
        """Mark a document as completed (successfully or not)."""
        with self.lock:
            self.in_progress -= 1
            if filename not in self.processed_files:
                self.processed_files.add(filename)
                if success:
                    self.completed += 1
                else:
                    self.failed += 1

    def get_stats(self) -> dict:
        """Return a snapshot of current processing statistics."""
        with self.lock:
            elapsed = time.time() - self.start_time
            remaining = self.total - (self.completed + self.failed)

            # Estimate remaining time from the average time per completed
            # document; unknown (None) until at least one has completed.
            if self.completed > 0:
                avg_time_per_doc = elapsed / self.completed
                est_remaining = avg_time_per_doc * remaining
            else:
                est_remaining = None

            return {
                'total': self.total,
                'completed': self.completed,
                'failed': self.failed,
                'in_progress': self.in_progress,
                'remaining': remaining,
                'elapsed_seconds': elapsed,
                'estimated_remaining_seconds': est_remaining,
            }

    def _format_time(self, seconds) -> str:
        """Format seconds as HH:MM:SS; return "unknown" for None."""
        if seconds is None:
            return "unknown"

        hours, remainder = divmod(int(seconds), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def _monitor_progress(self) -> None:
        """Log progress periodically until all documents are done or stop() is called."""
        while not self.stop_monitoring.is_set():
            stats = self.get_stats()

            self._logger.info(
                f"Progress: {stats['completed']}/{stats['total']} completed, "
                f"{stats['failed']} failed, {stats['in_progress']} in progress | "
                f"Elapsed: {self._format_time(stats['elapsed_seconds'])} | "
                f"Est. remaining: {self._format_time(stats['estimated_remaining_seconds'])}"
            )

            # Exit once every document has been accounted for.
            if stats['completed'] + stats['failed'] >= stats['total']:
                self._logger.info("All documents processed!")
                break

            # wait() doubles as an interruptible sleep: returns early if
            # stop_monitoring is set.
            self.stop_monitoring.wait(self.update_interval)

    def start_monitoring(self) -> None:
        """Start the background monitoring thread (daemon, so it never blocks exit)."""
        self.monitor_thread = threading.Thread(target=self._monitor_progress)
        self.monitor_thread.daemon = True
        self.monitor_thread.start()

    def stop(self) -> dict:
        """Stop the monitoring thread, log final results, and return final stats."""
        if self.monitor_thread and self.monitor_thread.is_alive():
            self.stop_monitoring.set()
            self.monitor_thread.join(timeout=2.0)

        stats = self.get_stats()
        self._logger.info(
            f"Final results: {stats['completed']}/{stats['total']} completed, "
            f"{stats['failed']} failed | "
            f"Total time: {self._format_time(stats['elapsed_seconds'])}"
        )

        success_rate = (stats['completed'] / stats['total']) * 100 if stats['total'] > 0 else 0
        self._logger.info(f"Success rate: {success_rate:.2f}%")

        return stats
|
main.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging # Import logging
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
import time
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 6 |
+
from typing import List, Dict, Any, Tuple
|
| 7 |
+
|
| 8 |
+
from app.config.settings import DOCS_FOLDER
|
| 9 |
+
# Import classes from the renamed modules
|
| 10 |
+
from app.document_processing.extractors import DocumentProcessorAdapter
|
| 11 |
+
from app.retrieval.vector_store import Retriever
|
| 12 |
+
from app.summarization.output import SummaryOutputManager
|
| 13 |
+
from app.summarization.summarizer import DocumentSummarizer
|
| 14 |
+
|
| 15 |
+
# Configure logging for the main script
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def process_uploaded_files(uploaded_files) -> List[Dict[str, Any]]:
    """
    Process a list of files uploaded via Streamlit.

    Saves each upload into a temporary folder and hands that folder to the
    DocumentProcessorAdapter for extraction.

    Args:
        uploaded_files: List of Streamlit UploadedFile objects. (The type hint
            is omitted to avoid a top-level Streamlit import.)

    Returns:
        List of dictionaries with the extraction results, including chunk_size.
    """
    # Deferred import: only needed for st.warning on save failures.
    import streamlit as st

    t0 = time.time()
    logger.info(f"Starting processing for {len(uploaded_files)} uploaded files.")

    # The temporary directory is removed automatically on exit from the block.
    with tempfile.TemporaryDirectory() as tmpdir:
        logger.info(f"Using temporary directory: {tmpdir}")

        # Persist every upload into the temporary directory.
        for upload in uploaded_files:
            dest = os.path.join(tmpdir, upload.name)
            try:
                with open(dest, "wb") as out:
                    out.write(upload.getvalue())
                logger.debug(f"Saved uploaded file '{upload.name}' to '{dest}'")
            except Exception as e:
                # A file that cannot be saved is simply absent from tmpdir
                # and therefore skipped by the folder processor below.
                logger.error(f"Error saving uploaded file '{upload.name}' to temporary directory: {e}", exc_info=True)
                st.warning(f"Could not save uploaded file '{upload.name}' temporarily. It will be skipped.")

        # Run extraction over the whole temporary folder in one call.
        adapter = DocumentProcessorAdapter()
        extraction_results = adapter.process_folder(tmpdir)

    logger.info(f"Finished processing uploaded files in {time.time() - t0:.2f} seconds.")
    # Each entry carries 'filename', 'text', 'error', etc.
    return extraction_results
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def setup_retrieval_system(extraction_results: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Retriever]:
    """
    Set up the retrieval system (vector store) from extraction results.

    Args:
        extraction_results: List of dictionaries from document extraction.
            Each should contain 'filename' and 'text'.

    Returns:
        A tuple containing:
            - The updated extraction_results list (with 'chunk_size' populated
              by the Retriever).
            - An initialized Retriever instance.

    Raises:
        Exception: Re-raises any error from Retriever setup, since
            summarization cannot proceed without a working vector store.
    """
    start_time = time.time()
    logger.info("Setting up retrieval system.")
    try:
        retriever = Retriever()
        # create_from_documents chunks the text, embeds it, builds the DB,
        # and annotates each result dict with its 'chunk_size'.
        updated_extraction_results = retriever.create_from_documents(extraction_results)
        logger.info(f"Retriever setup complete in {time.time() - start_time:.2f} seconds.")
        return updated_extraction_results, retriever
    except Exception as e:
        # (Removed a dead `end_time` assignment that was never used here.)
        logger.error(f"Error during retrieval system setup: {e}", exc_info=True)
        # Re-raise so the Streamlit app can catch and display the failure.
        raise
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def summarize_extracted_documents(extraction_results: List[Dict[str, Any]], retriever: Retriever) -> List[Dict[str, Any]]:
    """
    Summarize documents based on extraction results and a configured retriever.

    Args:
        extraction_results: List of dictionaries from document extraction
            (should include 'chunk_size', populated by setup_retrieval_system).
        retriever: An initialized Retriever instance.

    Returns:
        A list of dictionaries, one per file, each containing:
            - 'filename': The name of the file.
            - 'success': Boolean indicating if summarization was successful.
            - 'summary': The generated summary string (if successful), or None.
            - 'error': An error message string (if not successful), or None.
            - 'processing_time': Time taken for summarization of this file.
    """
    start_time = time.time()
    logger.info(f"Starting summarization for {len(extraction_results)} documents.")

    # Initialize the summarizer with the retriever.
    summarizer = DocumentSummarizer(retriever)

    results: List[Dict[str, Any]] = []

    # Partition in a single pass. (The previous `res not in summarizable_results`
    # filter was O(n^2) and compared dicts by value, which could mis-classify
    # duplicate result dicts.) Summarization requires extracted text, a
    # successful chunking (chunk_size > 0), and no extraction error.
    summarizable_results: List[Dict[str, Any]] = []
    skipped_results: List[Dict[str, Any]] = []
    for res in extraction_results:
        if res.get('text') and res.get('chunk_size', 0) > 0 and res.get('error') is None:
            summarizable_results.append(res)
        else:
            skipped_results.append(res)

    if skipped_results:
        logger.warning(f"Skipping summarization for {len(skipped_results)} files due to extraction errors or no text/chunks.")
        for res in skipped_results:
            # Record skipped files so callers see every input reflected.
            results.append({
                'filename': res.get('filename', 'unknown'),
                'success': False,
                'summary': None,
                'error': res.get('error', 'Extraction failed or no text/chunks'),
                'processing_time': 0,  # no summarization time for skipped files
            })

    def process_single_summary(result: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize a single document result; never raises (errors are captured)."""
        file_start_time = time.time()
        filename = result.get('filename', 'unknown')
        # Use the detected language, defaulting to English.
        language = result.get('language', 'en')
        chunk_size = result.get('chunk_size', 0)  # > 0 for summarizable results

        # Bug fix: these log messages previously did not interpolate the
        # filename (they logged the literal text "(unknown)").
        logger.info(f"Summarizing document: {filename}")

        try:
            # summerize_document handles parallel processing of the summary
            # components internally.
            summary = summarizer.summerize_document(filename, language, chunk_size)

            file_end_time = time.time()
            logger.info(f"Finished summarizing {filename} in {file_end_time - file_start_time:.2f} seconds.")
            return {
                'filename': filename,
                'success': True,
                'summary': summary,
                'error': None,
                'processing_time': file_end_time - file_start_time,
            }
        except Exception as e:
            file_end_time = time.time()
            error_msg = str(e)
            logger.error(f"Error summarizing document {filename}: {e}", exc_info=True)
            return {
                'filename': filename,
                'success': False,
                'summary': None,
                'error': error_msg,
                'processing_time': file_end_time - file_start_time,
            }

    with ThreadPoolExecutor(max_workers=None) as executor:
        # Submit each summarizable document; map futures back to filenames.
        futures = {executor.submit(process_single_summary, res): res['filename'] for res in summarizable_results}

        # Collect results as they complete, in completion order.
        for future in as_completed(futures):
            filename = futures[future]
            try:
                summary_result = future.result()
                results.append(summary_result)
                logger.debug(f"Summary result received for {filename}")
            except Exception as exc:
                # Catches exceptions raised while retrieving the future's
                # result itself (process_single_summary handles its own).
                logger.error(f"Exception retrieving summary result for {filename}: {exc}", exc_info=True)
                results.append({
                    'filename': filename,
                    'success': False,
                    'summary': None,
                    'error': f"Failed to retrieve result: {exc}",
                    'processing_time': 0,  # unknown when result retrieval failed
                })

    logger.info(f"Finished batch summarization in {time.time() - start_time:.2f} seconds.")
    return results
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# if __name__ == "__main__":
|
| 216 |
+
# start_time = time.time()
|
| 217 |
+
# logger.info("Starting document summarization process (command line).")
|
| 218 |
+
#
|
| 219 |
+
# try:
|
| 220 |
+
# # Step 1: Process documents from the predefined folder
|
| 221 |
+
# logger.info(f"Processing documents from: {DOCS_FOLDER}")
|
| 222 |
+
# # DocumentProcessorAdapter().process_folder returns a list of extraction result dicts
|
| 223 |
+
# extraction_results = DocumentProcessorAdapter().process_folder(DOCS_FOLDER)
|
| 224 |
+
# logger.info(f"Document Processing Time taken: {time.time()-start_time:.2f} seconds")
|
| 225 |
+
#
|
| 226 |
+
# # Step 2: Setup retrieval system
|
| 227 |
+
# setup_start_time = time.time()
|
| 228 |
+
# # setup_retrieval_system takes extraction results and returns updated results (with chunk_size) and the retriever
|
| 229 |
+
# extraction_results_with_chunks, retriever = setup_retrieval_system(extraction_results)
|
| 230 |
+
# logger.info(f"Retriever Setup Time taken: {time.time() - setup_start_time:.2f} seconds")
|
| 231 |
+
#
|
| 232 |
+
# # Step 3: Summarize the documents
|
| 233 |
+
# summarization_start_time = time.time()
|
| 234 |
+
# # For command line, we might still want to save files locally
|
| 235 |
+
# output_manager = SummaryOutputManager() # Uses default output_dir from settings
|
| 236 |
+
# # summarize_extracted_documents performs the summarization and returns results
|
| 237 |
+
# summary_results = summarize_extracted_documents(extraction_results_with_chunks, retriever)
|
| 238 |
+
#
|
| 239 |
+
# # Step 4: Save summaries to files (for command-line only)
|
| 240 |
+
# logger.info("Saving summaries to files.")
|
| 241 |
+
# saved_count = 0
|
| 242 |
+
# for res in summary_results:
|
| 243 |
+
# if res['success'] and res['summary']:
|
| 244 |
+
# # Use the output_manager to save the summary string
|
| 245 |
+
# output_manager.save_summary(res['filename'], res['summary'], formats=['markdown'])
|
| 246 |
+
# saved_count += 1
|
| 247 |
+
# logger.info(f"Saved {saved_count} summaries.")
|
| 248 |
+
#
|
| 249 |
+
#
|
| 250 |
+
# logger.info(f"Summarization Time taken: {time.time() - summarization_start_time:.2f} seconds")
|
| 251 |
+
#
|
| 252 |
+
#
|
| 253 |
+
# # Output results summary to console
|
| 254 |
+
# logger.info("\n" + "=" * 50)
|
| 255 |
+
# logger.info("Summarization Process Complete.")
|
| 256 |
+
# logger.info("=" * 50)
|
| 257 |
+
# successful_count = sum(res.get('success', False) for res in summary_results)
|
| 258 |
+
# total_processed = len(summary_results) # Includes skipped files if they were added to results list earlier
|
| 259 |
+
# total_time = time.time() - start_time
|
| 260 |
+
#
|
| 261 |
+
# logger.info(f"Total files attempted: {len(extraction_results)}") # Total files found/attempted extraction
|
| 262 |
+
# logger.info(f"Files successfully extracted and summarizable: {len(extraction_results_with_chunks)}") # Files with text and chunks
|
| 263 |
+
# logger.info(f"Files summarized: {successful_count}/{total_processed}")
|
| 264 |
+
# logger.info(f"Total process time: {total_time:.2f} seconds")
|
| 265 |
+
# logger.info("=" * 50)
|
| 266 |
+
#
|
| 267 |
+
# # Print individual results status
|
| 268 |
+
# logger.info("\nIndividual File Results:")
|
| 269 |
+
# for result in summary_results:
|
| 270 |
+
# name = result.get('filename', 'unknown')
|
| 271 |
+
# status = "SUCCESS" if result['success'] else "FAILED"
|
| 272 |
+
# time_taken = result.get('processing_time', 0)
|
| 273 |
+
# error_msg = result.get('error', '')
|
| 274 |
+
# logger.info(f"- {name}: {status} ({time_taken:.2f}s) {f'Error: {error_msg}' if error_msg else ''}")
|
| 275 |
+
#
|
| 276 |
+
#
|
| 277 |
+
# except FileNotFoundError as fnf_error:
|
| 278 |
+
# logger.error(f"Configuration Error: {fnf_error}")
|
| 279 |
+
# print(f"Error: {fnf_error}")
|
| 280 |
+
# except Exception as main_error:
|
| 281 |
+
# logger.error(f"An unexpected error occurred during the main process: {main_error}", exc_info=True)
|
| 282 |
+
# print(f"An unexpected error occurred: {main_error}")
|
packages.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
libtesseract-dev
|
| 3 |
+
tesseract-ocr-hin
|
| 4 |
+
tesseract-ocr-ara
|
| 5 |
+
tesseract-ocr-spa
|
| 6 |
+
tesseract-ocr-chi-sim
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
python-dotenv
|
| 3 |
+
cohere
|
| 4 |
+
langchain
|
| 5 |
+
chromadb
|
| 6 |
+
protobuf~=3.20
|
| 7 |
+
langchain-chroma
|
| 8 |
+
langchain-cohere
|
| 9 |
+
pypdf
|
| 10 |
+
pytesseract
|
| 11 |
+
Pillow
|