Ronochieng commited on
Commit
8e3278d
·
1 Parent(s): df9d5f3

Feat: DocMindAI init

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. Ingestion/__init__.py +0 -0
  3. Ingestion/ingest.py +250 -0
  4. app.py +1146 -0
  5. requirements.txt +19 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=REDACTED  # SECURITY: a real key was committed here publicly — revoke/rotate it immediately and keep .env out of version control
Ingestion/__init__.py ADDED
File without changes
Ingestion/ingest.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Any, Callable, Optional

import pandas as pd

# Import Langchain document loaders
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
    UnstructuredExcelLoader,
    UnstructuredMarkdownLoader,
    UnstructuredHTMLLoader,
    UnstructuredXMLLoader,
    UnstructuredEmailLoader,
    UnstructuredFileLoader,
    UnstructuredEPubLoader,
    CSVLoader,
    TextLoader
)
20
+
21
def get_processor_for_file(file_path: str) -> Callable[[str], str]:
    """Return the processor function for *file_path* based on its extension.

    Unknown extensions fall back to ``process_generic``, so the result is
    never ``None`` (the original ``Optional[callable]`` annotation was wrong:
    ``callable`` is a builtin function, not a type, and None is unreachable).
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    # Map file extensions to specific processor functions.
    processors: dict[str, Callable[[str], str]] = {
        ".pdf": process_pdf,
        ".docx": process_docx,
        ".doc": process_docx,
        ".pptx": process_pptx,
        ".ppt": process_pptx,
        ".xlsx": process_xlsx,
        ".xls": process_xlsx,
        ".md": process_markdown,
        ".html": process_html,
        ".htm": process_html,
        ".xml": process_xml,
        ".msg": process_email,
        ".eml": process_email,
        ".epub": process_epub,
        ".txt": process_text,
        ".csv": process_csv,
        ".rtf": process_text,

        # Code files are all treated as plain text.
        ".py": process_text,
        ".js": process_text,
        ".java": process_text,
        ".ts": process_text,
        ".tsx": process_text,
        ".jsx": process_text,
        ".c": process_text,
        ".cpp": process_text,
        ".h": process_text,
        ".cs": process_text,
        ".rb": process_text,
        ".go": process_text,
        ".rs": process_text,
        ".php": process_text,
        ".sql": process_text,
        ".css": process_text,
    }

    return processors.get(file_extension, process_generic)
67
+
68
def process_document(file_path: str) -> Optional[str]:
    """Extract text from *file_path* using the extension-matched processor."""
    handler = get_processor_for_file(file_path)
    return handler(file_path) if handler else None
76
+
77
def process_pdf(file_path: str) -> str:
    """Extract text from a PDF with PyPDFLoader; pages joined by blank lines."""
    pages = PyPDFLoader(file_path).load()
    return "\n\n".join(page.page_content for page in pages if page.page_content)
88
+
89
def process_docx(file_path: str) -> str:
    """Extract text from a Word document via UnstructuredWordDocumentLoader."""
    elements = UnstructuredWordDocumentLoader(file_path).load()
    return "\n\n".join(el.page_content for el in elements if el.page_content)
100
+
101
def process_pptx(file_path: str) -> str:
    """Extract text from a PowerPoint file via UnstructuredPowerPointLoader."""
    slides = UnstructuredPowerPointLoader(file_path).load()
    return "\n\n".join(s.page_content for s in slides if s.page_content)
112
+
113
def process_xlsx(file_path: str) -> str:
    """Extract text from an Excel workbook via UnstructuredExcelLoader."""
    sheets = UnstructuredExcelLoader(file_path).load()
    return "\n\n".join(s.page_content for s in sheets if s.page_content)
124
+
125
def process_markdown(file_path: str) -> str:
    """Extract text from a Markdown file via UnstructuredMarkdownLoader."""
    sections = UnstructuredMarkdownLoader(file_path).load()
    return "\n\n".join(s.page_content for s in sections if s.page_content)
136
+
137
def process_html(file_path: str) -> str:
    """Extract text from an HTML file via UnstructuredHTMLLoader."""
    parts = UnstructuredHTMLLoader(file_path).load()
    return "\n\n".join(p.page_content for p in parts if p.page_content)
148
+
149
def process_xml(file_path: str) -> str:
    """Extract text from an XML file via UnstructuredXMLLoader."""
    parts = UnstructuredXMLLoader(file_path).load()
    return "\n\n".join(p.page_content for p in parts if p.page_content)
160
+
161
def process_email(file_path: str) -> str:
    """Extract text from an email message via UnstructuredEmailLoader."""
    messages = UnstructuredEmailLoader(file_path).load()
    return "\n\n".join(m.page_content for m in messages if m.page_content)
172
+
173
def process_text(file_path: str) -> str:
    """Read a plain-text (or code) file via Langchain's TextLoader.

    Tries UTF-8 first and falls back to Latin-1 — which accepts any byte
    sequence — when the file is not valid UTF-8. The load/join logic was
    previously duplicated in both branches; it is shared here.
    """
    try:
        docs = TextLoader(file_path, encoding="utf-8").load()
    except UnicodeDecodeError:
        # Retry with a lenient single-byte encoding.
        docs = TextLoader(file_path, encoding="latin-1").load()

    texts = [doc.page_content for doc in docs if doc.page_content]
    return "\n\n".join(texts)
194
+
195
def process_csv(file_path: str) -> str:
    """Render a CSV file as newline-joined rows using Langchain's CSVLoader."""
    records = CSVLoader(file_path).load()

    lines = []
    if records:
        first = records[0]
        # Emit a header line when the loader exposes column names in metadata.
        if hasattr(first, 'metadata') and 'columns' in first.metadata:
            lines.append(",".join(first.metadata['columns']))
        lines.extend(rec.page_content for rec in records)

    return "\n".join(lines)
214
+
215
def process_epub(file_path: str) -> str:
    """Extract text from an EPUB ebook via UnstructuredEPubLoader."""
    chapters = UnstructuredEPubLoader(file_path).load()
    return "\n\n".join(c.page_content for c in chapters if c.page_content)
226
+
227
def process_generic(file_path: str) -> str:
    """Last-resort processor for extensions with no dedicated handler.

    Tries UnstructuredFileLoader first, then falls back to reading the file
    as raw text (UTF-8, then Latin-1). Raises Exception — now with the
    causal chain preserved via ``raise ... from`` — when every strategy fails.
    """
    try:
        docs = UnstructuredFileLoader(file_path).load()
        texts = [doc.page_content for doc in docs if doc.page_content]
        return "\n\n".join(texts)
    except Exception as e:
        # Unstructured could not handle the file; fall back to plain text.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception:
            # Latin-1 accepts any byte sequence, so this only fails on
            # genuine I/O errors (missing file, permissions, ...).
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e2:
                raise Exception(f"Could not process file: {str(e)} / {str(e2)}") from e2
app.py ADDED
@@ -0,0 +1,1146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import sys
3
+ import hashlib
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import os
7
+ import tempfile
8
+ from typing import List, Optional, Dict, Any, Union
9
+ import json
10
+ import openai
11
+ from datetime import datetime
12
+ from langchain.output_parsers import PydanticOutputParser
13
+ from langchain.prompts import ChatPromptTemplate
14
+ from langchain.schema import HumanMessage, SystemMessage
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.schema.runnable import RunnablePassthrough
17
+ from langchain.prompts.prompt import PromptTemplate
18
+ from langchain.memory import ConversationBufferMemory
19
+ from langchain_community.vectorstores import Chroma
20
+ from pydantic import BaseModel, Field
21
+ from Ingestion.ingest import process_document, get_processor_for_file
22
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
23
+
24
import warnings
# Silence asyncio-related RuntimeWarnings that Streamlit can trigger on rerun.
warnings.filterwarnings("ignore", category=RuntimeWarning)


sys.path.append("../..")
from dotenv import load_dotenv, find_dotenv

# Load environment variables (including OPENAI_API_KEY) from a .env file.
_ = load_dotenv(find_dotenv())


# NOTE(review): this raises a bare KeyError if OPENAI_API_KEY is unset —
# consider a friendlier startup error.
openai.api_key = os.environ["OPENAI_API_KEY"]


# Set event loop policy for Windows if needed
if sys.platform == "win32" and sys.version_info >= (3, 8):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Set page configuration (must be the first Streamlit call in the script).
st.set_page_config(
    page_title="DocMind AI: AI-Powered Document Analysis",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded",
)
48
+
49
# Custom CSS for better dark/light mode compatibility
# (injected once per rerun; uses prefers-color-scheme so the same markup
# adapts to the user's theme)
st.markdown("""
<style>
    /* Common styles for both modes */
    .stApp {
        max-width: 1200px;
        margin: 0 auto;
    }

    /* Card styling for results */
    .card {
        border-radius: 5px;
        padding: 1.5rem;
        margin-bottom: 1rem;
        border: 1px solid rgba(128, 128, 128, 0.2);
    }

    /* Dark mode specific */
    @media (prefers-color-scheme: dark) {
        .card {
            background-color: rgba(255, 255, 255, 0.05);
        }

        .highlight-container {
            background-color: rgba(255, 255, 255, 0.05);
            border-left: 3px solid #4CAF50;
        }

        .chat-user {
            background-color: rgba(0, 0, 0, 0.2);
        }

        .chat-ai {
            background-color: rgba(76, 175, 80, 0.1);
        }
    }

    /* Light mode specific */
    @media (prefers-color-scheme: light) {
        .card {
            background-color: rgba(0, 0, 0, 0.02);
        }

        .highlight-container {
            background-color: rgba(0, 0, 0, 0.03);
            border-left: 3px solid #4CAF50;
        }

        .chat-user {
            background-color: rgba(240, 240, 240, 0.7);
        }

        .chat-ai {
            background-color: rgba(76, 175, 80, 0.05);
        }
    }

    /* Chat message styling */
    .chat-container {
        margin-bottom: 1rem;
    }

    .chat-message {
        padding: 1rem;
        border-radius: 5px;
        margin-bottom: 0.5rem;
    }

    /* Highlight sections */
    .highlight-container {
        padding: 1rem;
        margin: 1rem 0;
        border-radius: 4px;
    }

    /* Status indicators */
    .status-success {
        color: #4CAF50;
    }

    .status-error {
        color: #F44336;
    }

    /* Document list */
    .doc-list {
        list-style-type: none;
        padding-left: 0;
    }

    .doc-list li {
        padding: 0.5rem 0;
        border-bottom: 1px solid rgba(128, 128, 128, 0.2);
    }

    /* Document card */
    .doc-card {
        padding: 0.8rem;
        border-radius: 4px;
        border: 1px solid rgba(128, 128, 128, 0.2);
        margin-bottom: 0.5rem;
        cursor: pointer;
    }

    .doc-card:hover {
        background-color: rgba(76, 175, 80, 0.1);
    }

    .doc-card.selected {
        background-color: rgba(76, 175, 80, 0.2);
        border-color: #4CAF50;
    }
</style>
""", unsafe_allow_html=True)
163
+
164
+ # Define the output structures using Pydantic
165
# Pydantic schema the LLM must fill in; PydanticOutputParser derives the
# format instructions from these Field descriptions, so changing them
# changes the prompt.
class DocumentAnalysis(BaseModel):
    summary: str = Field(description="A concise summary of the document")
    key_insights: List[str] = Field(description="A list of key insights from the document")
    # Optional sections — the model may omit them (defaults to None).
    action_items: Optional[List[str]] = Field(None, description="A list of action items derived from the document")
    open_questions: Optional[List[str]] = Field(None, description="A list of open questions or areas needing clarification")
170
+
171
def hash_file(file_content):
    """Return the SHA-256 hex digest of *file_content* (used for dedup)."""
    digest = hashlib.sha256()
    digest.update(file_content)
    return digest.hexdigest()
174
+
175
class DocumentStore:
    """Content-addressed store for uploaded documents and cached analyses.

    Files are saved under ``storage_dir`` keyed by their SHA-256 hash.
    Two JSON sidecar files persist state across sessions:
    ``metadata.json`` (per-document info) and ``analysis_results.json``
    (per-document and combined analysis results).
    """

    def __init__(self, storage_dir="document_store"):
        self.storage_dir = storage_dir
        os.makedirs(storage_dir, exist_ok=True)
        self.metadata_path = os.path.join(storage_dir, "metadata.json")
        self.analysis_path = os.path.join(storage_dir, "analysis_results.json")
        self.load_metadata()
        self.load_analysis_results()

    def load_metadata(self):
        """Load document metadata from disk; empty dict if none saved yet."""
        if os.path.exists(self.metadata_path):
            # Explicit encoding so behavior matches across platforms.
            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                self.metadata = json.load(f)
        else:
            self.metadata = {}

    def load_analysis_results(self):
        """Load cached analysis results; empty dict if none saved yet."""
        if os.path.exists(self.analysis_path):
            with open(self.analysis_path, 'r', encoding='utf-8') as f:
                self.analysis_results = json.load(f)
        else:
            self.analysis_results = {}

    def save_metadata(self):
        """Persist document metadata to disk."""
        with open(self.metadata_path, 'w', encoding='utf-8') as f:
            json.dump(self.metadata, f)

    def save_analysis_results(self):
        """Persist analysis results to disk."""
        with open(self.analysis_path, 'w', encoding='utf-8') as f:
            json.dump(self.analysis_results, f)

    def get_all_documents(self):
        """Return all documents in the store (hash -> metadata dict)."""
        return self.metadata

    def file_exists(self, file_hash):
        """Check if a file with the given hash exists in the store."""
        return file_hash in self.metadata

    def get_document_path(self, file_hash):
        """Get the file path for a stored document, or None if unknown."""
        if file_hash in self.metadata:
            return os.path.join(self.storage_dir, file_hash)
        return None

    def add_document(self, file, file_hash):
        """Add a new uploaded document (Streamlit UploadedFile) to the store."""
        # Save the file to disk under its content hash.
        file_path = os.path.join(self.storage_dir, file_hash)
        with open(file_path, 'wb') as f:
            f.write(file.getbuffer())

        self.metadata[file_hash] = {
            "filename": file.name,
            "upload_date": datetime.now().isoformat(),
            "size": len(file.getbuffer())
        }
        self.save_metadata()

    def add_analysis_result(self, doc_hash, analysis_result):
        """Store (or overwrite) the analysis result for a document."""
        # The previous "initialize to {}" branch was dead code: the entry is
        # unconditionally replaced on the next line anyway.
        self.analysis_results[doc_hash] = {
            "result": analysis_result,
            "timestamp": datetime.now().isoformat()
        }
        self.save_analysis_results()

    def add_combined_analysis(self, doc_hashes, analysis_result):
        """Store a combined analysis keyed by the sorted-joined doc hashes."""
        session_id = "_".join(sorted(doc_hashes))

        if "combined" not in self.analysis_results:
            self.analysis_results["combined"] = {}

        self.analysis_results["combined"][session_id] = {
            "result": analysis_result,
            "timestamp": datetime.now().isoformat(),
            "doc_hashes": doc_hashes
        }
        self.save_analysis_results()

    def has_analysis(self, doc_hash):
        """True if an analysis result is cached for this document."""
        return doc_hash in self.analysis_results

    def has_combined_analysis(self, doc_hashes):
        """True if a combined analysis is cached for this exact document set."""
        if "combined" not in self.analysis_results:
            return False

        session_id = "_".join(sorted(doc_hashes))
        return session_id in self.analysis_results["combined"]

    def get_analysis(self, doc_hash):
        """Return the cached analysis for a document, or None."""
        return self.analysis_results.get(doc_hash, {}).get("result")

    def get_combined_analysis(self, doc_hashes):
        """Return the cached combined analysis for a document set, or None."""
        if "combined" not in self.analysis_results:
            return None

        session_id = "_".join(sorted(doc_hashes))
        return self.analysis_results["combined"].get(session_id, {}).get("result")
286
+
287
+
288
+ # Function to clean up LLM responses for better parsing
289
def clean_llm_response(response):
    """Extract the JSON payload from an LLM reply.

    Accepts either a raw string or an OpenAI-style response dict, strips
    markdown code fences, and unwraps results nested under a top-level
    "properties" key. Returns the cleaned string; if the content is not
    valid JSON it is returned as-is (the previous bare ``except:`` is
    narrowed to ``json.JSONDecodeError`` so real errors surface).
    """
    # Pull the text content out of an OpenAI-style response dict if needed.
    if isinstance(response, dict) and 'choices' in response:
        content = response['choices'][0]['message']['content']
    else:
        content = str(response)

    # Remove markdown code block formatting if present.
    if '```' in content:
        parts = content.split('```')
        if len(parts) >= 3:  # has both opening and closing backticks
            # Take the content between the first pair of backticks.
            content = parts[1]
            # Drop a leading "json"/"JSON" language tag.
            if content.startswith('json') or content.startswith('JSON'):
                content = content[4:].lstrip()
    elif '`json' in content:
        # Handle single-backtick `json fences.
        parts = content.split('`json')
        if len(parts) >= 2:
            content = parts[1]
            if '`' in content:
                content = content.split('`')[0]

    content = content.strip()

    try:
        json_data = json.loads(content)
    except json.JSONDecodeError:
        # Not JSON — hand back the cleaned text unchanged.
        return content

    # Some models nest the real payload under a "properties" key.
    if isinstance(json_data, dict) and "properties" in json_data:
        return json.dumps(json_data["properties"])

    return content
331
+
332
+ # Initialize LLM without widgets in the cached function
333
@st.cache_resource(show_spinner="Loading Model...")
def load_model():
    """Load and cache the chat LLM (OpenAI gpt-4o-mini); None on failure."""
    try:
        # Low temperature keeps structured (JSON) outputs more reliable.
        return ChatOpenAI(temperature=0.1, model_name="gpt-4o-mini")
    except Exception as e:
        # Fixed: the old message said "Gemini" but the model is OpenAI.
        st.error(f"Error loading OpenAI model: {e}")
        return None
342
+
343
+ # Initialize embeddings without widgets in the cached function
344
@st.cache_resource(show_spinner=False)
def load_embeddings():
    """Load and cache the OpenAI embeddings model; returns None on failure."""
    try:
        return OpenAIEmbeddings(model="text-embedding-3-large")
    except Exception as e:
        st.error(f"Error loading embeddings model: {e}")
        return None
353
+
354
# Initialize session state variables
# Streamlit reruns the whole script on every interaction, so all
# cross-rerun state lives in st.session_state and is created exactly once.
if 'model_loaded' not in st.session_state:
    st.session_state['model_loaded'] = False
if 'embeddings_loaded' not in st.session_state:
    st.session_state['embeddings_loaded'] = False
if 'document_store' not in st.session_state:
    # Persistent on-disk store shared across reruns.
    st.session_state['document_store'] = DocumentStore()
if 'chat_sessions' not in st.session_state:
    # session_id -> QA closure built by setup_document_chat.
    st.session_state['chat_sessions'] = {}
if 'session_history' not in st.session_state:
    # session_id -> list of chat turns.
    st.session_state['session_history'] = {}
if 'selected_docs' not in st.session_state:
    # Hashes of documents currently selected in the library UI.
    st.session_state['selected_docs'] = []
if 'analyzed_docs' not in st.session_state:
    st.session_state['analyzed_docs'] = set()
if 'analyzed_combinations' not in st.session_state:
    st.session_state['analyzed_combinations'] = set()
if 'active_tab' not in st.session_state:
    st.session_state['active_tab'] = "Upload & Manage Documents"
373
+
374
# Sidebar Configuration with improved styling
st.sidebar.markdown("<div style='text-align: center;'><h1>🧠 DocMind AI</h1></div>", unsafe_allow_html=True)
st.sidebar.markdown("<div style='text-align: center;'>AI-Powered Document Analysis</div>", unsafe_allow_html=True)
st.sidebar.markdown("---")

# Load LLM - Only show loading spinner once
# (load_model is @st.cache_resource, so repeat calls return the cached object)
with st.sidebar:
    if not st.session_state.get('model_loaded', False):
        llm = load_model()
        if llm:
            st.session_state['model_loaded'] = True
        else:
            st.session_state['model_loaded'] = False
    else:
        llm = load_model()  # Will use cached version

    if st.session_state.get('model_loaded'):
        st.markdown("<div class='status-success'>✅ Model loaded successfully!</div>", unsafe_allow_html=True)
    else:
        st.markdown("<div class='status-error'>❌ Error loading model.</div>", unsafe_allow_html=True)
        # Abort the script: nothing below works without the LLM.
        st.stop()

# Load embeddings - Only show loading spinner once
with st.sidebar:
    # Assumes 'embeddings_loaded' was initialized above.
    if not st.session_state['embeddings_loaded']:
        with st.spinner("Loading embeddings..."):
            embeddings = load_embeddings()
            if embeddings:
                st.session_state['embeddings_loaded'] = True
            else:
                st.session_state['embeddings_loaded'] = False
    else:
        embeddings = load_embeddings()  # Will use cached version

    if st.session_state.get('embeddings_loaded'):
        st.markdown("<div class='status-success'>✅ Embeddings loaded successfully!</div>", unsafe_allow_html=True)
    else:
        st.markdown("<div class='status-error'>❌ Error loading embeddings.</div>", unsafe_allow_html=True)
        st.stop()
413
+
414
+ # Create a unique session ID for a document set
415
def get_session_id(doc_hashes):
    """Build a deterministic session key from a set of document hashes."""
    ordered = sorted(doc_hashes)
    return "_".join(ordered)
417
+
418
+ # Process documents using the document store
419
def process_documents(file_hashes):
    """Extract text for the given stored documents, in parallel.

    Returns a list of {"name", "data", "hash"} dicts for every document
    whose text could be extracted. Fixes two defects in the original:
    the except-branch referenced ``file_name`` before assignment (NameError
    when the metadata lookup failed), and an empty input crashed
    ThreadPoolExecutor with ``max_workers=0``.
    """
    processed_docs = []
    if not file_hashes:
        return processed_docs

    doc_store = st.session_state['document_store']

    # Create a progress bar
    progress_bar = st.progress(0)

    # Use ThreadPoolExecutor for parallel processing
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def process_single_document(file_hash, index, total):
        # Resolve the name first (with the hash as fallback) so the error
        # message below can never hit an unbound local.
        file_name = doc_store.metadata.get(file_hash, {}).get("filename", file_hash)
        try:
            file_path = doc_store.get_document_path(file_hash)

            if file_path and os.path.exists(file_path):
                processor = get_processor_for_file(file_path)
                if processor:
                    # Process in chunks for large files
                    doc_data = process_document_in_chunks(file_path, processor)
                    if doc_data is not None and len(doc_data.strip()) > 0:
                        # list.append is atomic under the GIL, so appending
                        # from worker threads is safe here.
                        processed_docs.append({"name": file_name, "data": doc_data, "hash": file_hash})

            # Update progress
            progress_bar.progress((index + 1) / total)
            return True
        except Exception as e:
            st.error(f"Error processing {file_name}: {str(e)}")
            return False

    # Process documents in parallel
    total_docs = len(file_hashes)
    with ThreadPoolExecutor(max_workers=min(4, total_docs)) as executor:
        futures = {executor.submit(process_single_document, fh, i, total_docs): fh
                   for i, fh in enumerate(file_hashes)}

        for future in as_completed(futures):
            _ = future.result()

    return processed_docs
459
+
460
def process_document_in_chunks(file_path, processor, chunk_size=5*1024*1024):
    """Process large documents in chunks to avoid memory issues"""
    file_size = os.path.getsize(file_path)

    if file_size <= chunk_size:
        # For small files, process normally
        return processor(file_path)

    # For large files, especially PDFs, use a chunked approach
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == ".pdf":
        # For PDFs, process page by page
        # NOTE(review): process_pdf_by_page is not defined in this part of
        # the file — confirm it exists elsewhere in app.py.
        return process_pdf_by_page(file_path)
    else:
        # For other large files, try to process normally but with timeout
        try:
            import signal

            class TimeoutException(Exception): pass

            def timeout_handler(signum, frame):
                raise TimeoutException("Processing timed out")

            # Set timeout of 30 seconds
            # NOTE(review): SIGALRM exists only on Unix and signal.signal
            # must run on the main thread; this function is invoked from
            # ThreadPoolExecutor workers (see process_documents), where this
            # call raises and control falls to the bare except below.
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(30)

            try:
                result = processor(file_path)
                signal.alarm(0)  # Cancel the alarm
                return result
            except TimeoutException:
                # If timeout occurs, fall back to basic text extraction
                # NOTE(review): basic_text_extraction is not defined in this
                # chunk of the file — confirm it exists elsewhere.
                return basic_text_extraction(file_path)
        except:
            # If signal handling is not available (e.g., on Windows)
            # NOTE(review): this bare except also swallows real processor
            # errors and retries without any timeout.
            return processor(file_path)
497
+
498
+ # Function to set up document chat
499
def setup_document_chat(processed_docs):
    """Build a retrieval-QA function over *processed_docs* and register it.

    Splits each document's text into chunks, embeds them into a Chroma
    collection, and stores a QA closure in
    ``st.session_state['chat_sessions']`` keyed by the session id.
    Returns the session id, or None if no chunks were produced or setup
    failed. Fixes the original's broken LCEL usage: it piped a PromptValue
    into the model (``template.invoke({...}) | llm``), which raises
    TypeError at runtime; the prompt and model must be composed first.
    """
    doc_hashes = [doc['hash'] for doc in processed_docs]
    session_id = get_session_id(doc_hashes)

    with st.spinner("Setting up document chat..."):
        try:
            # Larger chunks reduce the number of embedding calls.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1500,
                chunk_overlap=150,
                length_function=len
            )

            all_chunks = []
            for doc in processed_docs:
                if not doc['data'] or len(doc['data'].strip()) == 0:
                    continue

                # Split the document into chunks
                chunks = text_splitter.split_text(doc['data'])

                if chunks:
                    # Prefix each chunk with its source so answers can cite it.
                    chunks = [f"Source: {doc['name']}\n\n{chunk}" for chunk in chunks]
                    all_chunks.extend(chunks)

            # If we have chunks, create the vector store
            if all_chunks:
                # Collection name derived from the document hashes so the
                # same document set reuses one collection.
                collection_name = f"docmind_{session_id}"

                vectorstore = Chroma.from_texts(
                    texts=all_chunks,
                    embedding=embeddings,
                    collection_name=collection_name,
                    collection_metadata={"timestamp": datetime.now().isoformat()}
                )

                # Configure retriever for better performance
                retriever = vectorstore.as_retriever(
                    search_kwargs={
                        "k": 5,      # Retrieve top 5 chunks
                        "fetch_k": 20  # Consider top 20 before selecting top 5 (for MMR)
                    }
                )

                def document_qa(query):
                    """Answer *query* from retrieved context; returns {"answer": ...}."""
                    # Get relevant documents
                    docs = retriever.get_relevant_documents(query)

                    # Extract text from documents with source highlighting
                    context = "\n\n".join([doc.page_content for doc in docs])

                    system_template = """You are DocMind AI, a helpful assistant that answers questions about documents.
                    Use the following pieces of retrieved context to answer the user's question.
                    If the answer isn't in the context, just say you don't know.
                    Include the source document name when providing information.

                    Context:
                    {context}
                    """

                    template = ChatPromptTemplate.from_messages([
                        ("system", system_template),
                        ("human", "{question}")
                    ])

                    # FIX: compose prompt and model into a runnable chain,
                    # then invoke it — the original invoked the template
                    # first and piped the resulting PromptValue into the LLM.
                    chain = template | llm
                    response = chain.invoke({
                        "context": context,
                        "question": query
                    })

                    return {"answer": response}

                # Store the QA function in session state
                st.session_state['chat_sessions'][session_id] = document_qa

                # Initialize chat history
                if session_id not in st.session_state['session_history']:
                    st.session_state['session_history'][session_id] = []

                return session_id
            else:
                st.warning("No text chunks were created from the documents. Chat functionality is unavailable.")
                return None

        except Exception as e:
            st.error(f"Error setting up document chat: {str(e)}")
            return None
595
+
596
# Main content
# Get the tab options
tab_options = ["Upload & Manage Documents", "Document Analysis", "Chat with Documents"]
# NOTE(review): raises ValueError if session_state['active_tab'] is ever set
# to a value not in tab_options.
tab_index = tab_options.index(st.session_state['active_tab'])

# Create the tabs with the active tab selected
# NOTE(review): st.tabs has no programmatic selection API — `active_tab`
# only records which tab the code treats as current; confirm this matches
# the intended UX.
tab1, tab2, tab3 = st.tabs(tab_options)
tabs = [tab1, tab2, tab3]
active_tab = tabs[tab_index]
605
+
606
+ # Tab 1: Document Upload and Management
607
+ with tab1:
608
+ st.header("Upload & Manage Documents")
609
+
610
+ # File Upload with deduplication
611
+ uploaded_files = st.file_uploader(
612
+ "Upload Documents",
613
+ accept_multiple_files=True,
614
+ type=["pdf", "docx", "txt", "xlsx", "md", "json", "xml", "rtf", "csv", "msg", "pptx", "odt", "epub",
615
+ "py", "js", "java", "ts", "tsx", "c", "cpp", "h", "html", "css", "sql", "rb", "go", "rs", "php"]
616
+ )
617
+
618
+ doc_store = st.session_state['document_store']
619
+ new_files = []
620
+ existing_files = []
621
+
622
+ if uploaded_files:
623
+ for file in uploaded_files:
624
+ # Generate hash for the file content
625
+ file_hash = hash_file(file.getbuffer())
626
+
627
+ # Check if file exists in our document store
628
+ if doc_store.file_exists(file_hash):
629
+ existing_files.append((file.name, file_hash))
630
+ else:
631
+ # Store the file
632
+ doc_store.add_document(file, file_hash)
633
+ new_files.append((file.name, file_hash))
634
+
635
+ # Display information about file upload status
636
+ col1, col2 = st.columns(2)
637
+
638
+ with col1:
639
+ if new_files:
640
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
641
+ st.markdown("### New Documents Added")
642
+ for name, file_hash in new_files:
643
+ st.markdown(f"- ✅ {name}")
644
+ # Automatically add to selected docs
645
+ if file_hash not in st.session_state['selected_docs']:
646
+ st.session_state['selected_docs'].append(file_hash)
647
+ st.markdown("</div>", unsafe_allow_html=True)
648
+
649
+ with col2:
650
+ if existing_files:
651
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
652
+ st.markdown("### Already Existing Documents")
653
+ for name, file_hash in existing_files:
654
+ st.markdown(f"- ℹ️ {name} (already in library)")
655
+ # Automatically add to selected docs
656
+ if file_hash not in st.session_state['selected_docs']:
657
+ st.session_state['selected_docs'].append(file_hash)
658
+ st.markdown("</div>", unsafe_allow_html=True)
659
+
660
+ # Display the document library
661
+ st.markdown("---")
662
+ st.header("Document Library")
663
+
664
+ available_docs = doc_store.get_all_documents()
665
+ if available_docs:
666
+ st.markdown("Select documents for analysis or chat:")
667
+
668
+ # Create a grid layout for document cards
669
+ cols = st.columns(3)
670
+ for i, (doc_hash, doc_info) in enumerate(available_docs.items()):
671
+ col_idx = i % 3
672
+ with cols[col_idx]:
673
+ is_selected = doc_hash in st.session_state['selected_docs']
674
+ is_analyzed = doc_hash in st.session_state['analyzed_docs']
675
+ card_class = "doc-card selected" if is_selected else "doc-card"
676
+ with st.container():
677
+ st.markdown(f"<div class='{card_class}'>", unsafe_allow_html=True)
678
+ analyzed_badge = "✅ " if is_analyzed else ""
679
+ st.markdown(f"**{analyzed_badge}{doc_info['filename']}**")
680
+ st.markdown(f"Uploaded: {doc_info['upload_date'][:10]}")
681
+ st.markdown(f"Size: {doc_info['size'] // 1024} KB")
682
+
683
+ if is_analyzed:
684
+ st.markdown("<span style='color:#4CAF50;font-size:0.8em;'>Analysis available</span>", unsafe_allow_html=True)
685
+
686
+ if st.button("Select" if not is_selected else "Deselect", key=f"btn_{doc_hash}"):
687
+ if is_selected:
688
+ st.session_state['selected_docs'].remove(doc_hash)
689
+ else:
690
+ st.session_state['selected_docs'].append(doc_hash)
691
+ st.rerun()
692
+
693
+ st.markdown("</div>", unsafe_allow_html=True)
694
+
695
+ # Show selected documents count
696
+ st.markdown("---")
697
+ if st.session_state['selected_docs']:
698
+ analyzed_count = sum(1 for doc_hash in st.session_state['selected_docs'] if doc_hash in st.session_state['analyzed_docs'])
699
+ total_selected = len(st.session_state['selected_docs'])
700
+
701
+ if analyzed_count > 0:
702
+ st.success(f"{total_selected} documents selected for analysis ({analyzed_count} already analyzed)")
703
+
704
+ # Add a button to jump directly to chat if all selected documents are analyzed
705
+ if analyzed_count == total_selected:
706
+ if st.button("Chat with selected documents"):
707
+ st.session_state['active_tab'] = "Chat with Documents"
708
+ st.rerun()
709
+ else:
710
+ st.success(f"{total_selected} documents selected for analysis")
711
+ else:
712
+ st.info("No documents selected. Please select documents for analysis.")
713
+ else:
714
+ st.info("No documents in the library. Please upload documents.")
715
+
716
+ # Tab 2: Document Analysis
717
+ with tab2:
718
+ st.header("Document Analysis")
719
+
720
+ # Mode Selection
721
+ st.subheader("Analysis Configuration")
722
+ analysis_mode = st.radio(
723
+ "Analysis Mode",
724
+ ["Analyze each document separately", "Combine analysis for all documents"]
725
+ )
726
+
727
+ # Prompt Selection
728
+ prompt_options = {
729
+ "Comprehensive Document Analysis": "Analyze the provided document comprehensively. Generate a summary, extract key insights, identify action items, and list open questions.",
730
+ "Extract Key Insights and Action Items": "Extract key insights and action items from the provided document.",
731
+ "Summarize and Identify Open Questions": "Summarize the provided document and identify any open questions that need clarification.",
732
+ "Custom Prompt": "Enter a custom prompt below:"
733
+ }
734
+
735
+ col1, col2 = st.columns(2)
736
+
737
+ with col1:
738
+ selected_prompt_option = st.selectbox("Select Prompt", list(prompt_options.keys()))
739
+ custom_prompt = ""
740
+ if selected_prompt_option == "Custom Prompt":
741
+ custom_prompt = st.text_area("Enter Custom Prompt", height=100)
742
+
743
+ # Tone Selection
744
+ tone_options = [
745
+ "Professional", "Academic", "Informal", "Creative", "Neutral",
746
+ "Direct", "Empathetic", "Humorous", "Authoritative", "Inquisitive"
747
+ ]
748
+
749
+ with col2:
750
+ selected_tone = st.selectbox("Select Tone", tone_options)
751
+ selected_length = st.selectbox(
752
+ "Select Response Format",
753
+ ["Concise", "Detailed", "Comprehensive", "Bullet Points"]
754
+ )
755
+
756
+ # Instructions Selection
757
+ instruction_options = {
758
+ "General Assistant": "Act as a helpful assistant.",
759
+ "Researcher": "Act as a researcher providing in-depth analysis.",
760
+ "Software Engineer": "Act as a software engineer focusing on code and technical details.",
761
+ "Product Manager": "Act as a product manager considering strategy and user experience.",
762
+ "Data Scientist": "Act as a data scientist emphasizing data analysis.",
763
+ "Business Analyst": "Act as a business analyst considering strategic aspects.",
764
+ "Technical Writer": "Act as a technical writer creating clear documentation.",
765
+ "Marketing Specialist": "Act as a marketing specialist focusing on branding.",
766
+ "HR Manager": "Act as an HR manager considering people aspects.",
767
+ "Legal Advisor": "Act as a legal advisor providing legal perspective.",
768
+ "Custom Instructions": "Enter custom instructions below:"
769
+ }
770
+
771
+ selected_instruction = st.selectbox("Select Assistant Behavior", list(instruction_options.keys()))
772
+ custom_instruction = ""
773
+ if selected_instruction == "Custom Instructions":
774
+ custom_instruction = st.text_area("Enter Custom Instructions", height=100)
775
+
776
+ # Display selected documents for analysis
777
+ st.subheader("Selected Documents for Analysis")
778
+ selected_docs = st.session_state['selected_docs']
779
+
780
+ if selected_docs:
781
+ st.markdown("<ul class='doc-list'>", unsafe_allow_html=True)
782
+ for doc_hash in selected_docs:
783
+ if doc_hash in doc_store.metadata:
784
+ st.markdown(f"<li>📄 {doc_store.metadata[doc_hash]['filename']}</li>", unsafe_allow_html=True)
785
+ st.markdown("</ul>", unsafe_allow_html=True)
786
+
787
+ # Create a centered button
788
+ col1, col2, col3 = st.columns([1, 2, 1])
789
+ with col2:
790
+ analyze_button = st.button("Extract and Analyze Documents", use_container_width=True)
791
+
792
+ # Analysis Results area placeholder
793
+ analysis_results = st.container()
794
+
795
+ if analyze_button:
796
+ # Process the documents and run analysis
797
+ with analysis_results:
798
+ with st.spinner("Analyzing documents..."):
799
+ processed_docs = process_documents(selected_docs)
800
+
801
+ if not processed_docs:
802
+ st.error("No documents could be processed. Please check the file formats and try again.")
803
+ else:
804
+ # Build the prompt
805
+ if selected_prompt_option == "Custom Prompt":
806
+ prompt_text = custom_prompt
807
+ else:
808
+ prompt_text = prompt_options[selected_prompt_option]
809
+
810
+ if selected_instruction == "Custom Instructions":
811
+ instruction_text = custom_instruction
812
+ else:
813
+ instruction_text = instruction_options[selected_instruction]
814
+
815
+ # Add tone guidance
816
+ tone_guidance = f"Use a {selected_tone.lower()} tone in your response."
817
+
818
+ # Add length guidance
819
+ length_guidance = ""
820
+ if selected_length == "Concise":
821
+ length_guidance = "Keep your response brief and to the point."
822
+ elif selected_length == "Detailed":
823
+ length_guidance = "Provide a detailed response with thorough explanations."
824
+ elif selected_length == "Comprehensive":
825
+ length_guidance = "Provide a comprehensive in-depth analysis covering all aspects."
826
+ elif selected_length == "Bullet Points":
827
+ length_guidance = "Format your response primarily using bullet points for clarity."
828
+
829
+ # Set up the output parser
830
+ output_parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)
831
+ format_instructions = output_parser.get_format_instructions()
832
+
833
+ if analysis_mode == "Analyze each document separately":
834
+ results = []
835
+
836
+ for doc in processed_docs:
837
+ with st.spinner(f"Analyzing {doc['name']}..."):
838
+ # Create system message with combined instructions
839
+ system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
840
+
841
+ prompt = f"""
842
+ {prompt_text}
843
+ Document: {doc['name']}
844
+ Content: {doc['data']}
845
+ """
846
+
847
+ try:
848
+ # Create a prompt template
849
+ system_template = f"{instruction_text} {tone_guidance} {length_guidance}"
850
+ messages = [
851
+ SystemMessage(content=system_template),
852
+ SystemMessage(content=f"Format your response according to these instructions: {format_instructions}"),
853
+ HumanMessage(content="{input}")
854
+ ]
855
+ template = ChatPromptTemplate.from_messages(messages)
856
+
857
+ # Get response from LLM
858
+ chain = template | llm
859
+ response = chain.invoke({"input": prompt})
860
+
861
+ # Try to parse the response into the pydantic model
862
+ try:
863
+ # Clean the response before parsing
864
+ cleaned_response = clean_llm_response(response)
865
+ parsed_response = output_parser.parse(cleaned_response)
866
+ results.append({
867
+ "document_name": doc['name'],
868
+ "analysis": parsed_response.dict()
869
+ })
870
+ except Exception as e:
871
+ # If parsing fails, include the raw response
872
+ results.append({
873
+ "document_name": doc['name'],
874
+ "analysis": str(response),
875
+ "parsing_error": str(e)
876
+ })
877
+ except Exception as e:
878
+ st.error(f"Error analyzing {doc['name']}: {str(e)}")
879
+
880
+ # Display results with card-based UI
881
+ for result in results:
882
+ st.markdown(f"<div class='card'>", unsafe_allow_html=True)
883
+ st.markdown(f"<h3>Analysis for: {result['document_name']}</h3>", unsafe_allow_html=True)
884
+
885
+ if isinstance(result['analysis'], dict) and 'parsing_error' not in result:
886
+ # Structured output
887
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
888
+ st.markdown("### Summary")
889
+ st.write(result['analysis']['summary'])
890
+ st.markdown("</div>", unsafe_allow_html=True)
891
+
892
+ st.markdown("### Key Insights")
893
+ for insight in result['analysis']['key_insights']:
894
+ st.markdown(f"- {insight}")
895
+
896
+ if result['analysis'].get('action_items'):
897
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
898
+ st.markdown("### Action Items")
899
+ for item in result['analysis']['action_items']:
900
+ st.markdown(f"- {item}")
901
+ st.markdown("</div>", unsafe_allow_html=True)
902
+
903
+ if result['analysis'].get('open_questions'):
904
+ st.markdown("### Open Questions")
905
+ for question in result['analysis']['open_questions']:
906
+ st.markdown(f"- {question}")
907
+ else:
908
+ # Raw output
909
+ st.markdown(result['analysis'])
910
+ if 'parsing_error' in result:
911
+ st.info(f"Note: The response could not be parsed into the expected format. Error: {result['parsing_error']}")
912
+
913
+ if 'parsing_error' not in result:
914
+ doc_hash = next((doc['hash'] for doc in processed_docs if doc['name'] == result['document_name']), None)
915
+ if doc_hash:
916
+ doc_store.add_analysis_result(doc_hash, result['analysis'])
917
+ st.session_state['analyzed_docs'].add(doc_hash)
918
+
919
+ if results:
920
+ st.markdown("---")
921
+ if st.button("Chat with these documents"):
922
+ # Switch to the chat tab
923
+ st.session_state['active_tab'] = "Chat with Documents"
924
+ st.rerun()
925
+
926
+ st.markdown("</div>", unsafe_allow_html=True)
927
+
928
+ else: # Combined analysis for all documents
929
+ with st.spinner("Analyzing all documents together..."):
930
+ # Combine all documents
931
+ combined_docs = []
932
+
933
+ for doc in processed_docs:
934
+ doc_content = f"Document: {doc['name']}\n\nContent: {doc['data']}"
935
+ combined_docs.append(doc_content)
936
+
937
+ combined_content = "\n\n" + "\n\n---\n\n".join(combined_docs)
938
+
939
+ # Create system message with combined instructions
940
+ system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
941
+
942
+ # Create the prompt template
943
+ template = ChatPromptTemplate.from_messages([
944
+ ("system", system_message),
945
+ ("human", "{input}")
946
+ ])
947
+
948
+ # Create the prompt
949
+ prompt = f"""
950
+ {prompt_text}
951
+ {combined_content}
952
+ """
953
+
954
+ try:
955
+ chain = template | llm
956
+ response = chain.invoke({"input": prompt})
957
+
958
+ # Try to parse the response into the pydantic model
959
+ try:
960
+ cleaned_response = clean_llm_response(response)
961
+ parsed_response = output_parser.parse(cleaned_response)
962
+
963
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
964
+ st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
965
+
966
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
967
+ st.markdown("### Summary")
968
+ st.write(parsed_response.summary)
969
+ st.markdown("</div>", unsafe_allow_html=True)
970
+
971
+ st.markdown("### Key Insights")
972
+ for insight in parsed_response.key_insights:
973
+ st.markdown(f"- {insight}")
974
+
975
+ if parsed_response.action_items:
976
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
977
+ st.markdown("### Action Items")
978
+ for item in parsed_response.action_items:
979
+ st.markdown(f"- {item}")
980
+ st.markdown("</div>", unsafe_allow_html=True)
981
+
982
+ if parsed_response.open_questions:
983
+ st.markdown("### Open Questions")
984
+ for question in parsed_response.open_questions:
985
+ st.markdown(f"- {question}")
986
+
987
+ if parsed_response:
988
+ # Store the combined analysis
989
+ doc_store.add_combined_analysis([doc['hash'] for doc in processed_docs], parsed_response.dict())
990
+ session_id = get_session_id([doc['hash'] for doc in processed_docs])
991
+ st.session_state['analyzed_combinations'].add(session_id)
992
+
993
+ # Add button to chat with these documents
994
+ st.markdown("---")
995
+ if st.button("Chat with these documents"):
996
+ # Switch to the chat tab
997
+ st.session_state['active_tab'] = "Chat with Documents"
998
+ st.rerun()
999
+
1000
+ st.markdown("</div>", unsafe_allow_html=True)
1001
+
1002
+ except Exception as e:
1003
+ # If parsing fails, display raw response
1004
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
1005
+ st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
1006
+ st.markdown(str(response))
1007
+ st.info(f"Note: The response could not be parsed into the expected format. Error: {str(e)}")
1008
+ st.markdown("</div>", unsafe_allow_html=True)
1009
+
1010
+ except Exception as e:
1011
+ st.error(f"Error analyzing documents: {str(e)}")
1012
+
1013
+ # Tab 3: Chat with Documents
1014
+ with tab3:
1015
+ st.header("Chat with Documents")
1016
+
1017
+ # Display selected documents for chat
1018
+ st.subheader("Selected Documents")
1019
+ selected = st.session_state['selected_docs']
1020
+
1021
+ if selected:
1022
+ # Display selected documents
1023
+ st.markdown("<ul class='doc-list'>", unsafe_allow_html=True)
1024
+ for doc_hash in selected:
1025
+ if doc_hash in doc_store.metadata:
1026
+ doc_name = doc_store.metadata[doc_hash]["filename"]
1027
+ analyzed_status = "✅ (Analyzed)" if doc_hash in st.session_state['analyzed_docs'] else "📄"
1028
+ st.markdown(f"<li>{analyzed_status} {doc_name}</li>", unsafe_allow_html=True)
1029
+ st.markdown("</ul>", unsafe_allow_html=True)
1030
+
1031
+ # Check if all documents have been analyzed
1032
+ all_analyzed = all(doc_hash in st.session_state['analyzed_docs'] for doc_hash in selected)
1033
+ session_id = get_session_id(selected)
1034
+ has_combined_analysis = session_id in st.session_state['analyzed_combinations']
1035
+
1036
+ # Show analysis results if available
1037
+ if has_combined_analysis:
1038
+ with st.expander("View Combined Analysis Results", expanded=False):
1039
+ combined_analysis = doc_store.get_combined_analysis(selected)
1040
+ if combined_analysis:
1041
+ # Display the combined analysis
1042
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
1043
+ st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
1044
+
1045
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
1046
+ st.markdown("### Summary")
1047
+ st.write(combined_analysis['summary'])
1048
+ st.markdown("</div>", unsafe_allow_html=True)
1049
+
1050
+ st.markdown("### Key Insights")
1051
+ for insight in combined_analysis['key_insights']:
1052
+ st.markdown(f"- {insight}")
1053
+
1054
+ if combined_analysis.get('action_items'):
1055
+ st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
1056
+ st.markdown("### Action Items")
1057
+ for item in combined_analysis['action_items']:
1058
+ st.markdown(f"- {item}")
1059
+ st.markdown("</div>", unsafe_allow_html=True)
1060
+
1061
+ if combined_analysis.get('open_questions'):
1062
+ st.markdown("### Open Questions")
1063
+ for question in combined_analysis['open_questions']:
1064
+ st.markdown(f"- {question}")
1065
+
1066
+ st.markdown("</div>", unsafe_allow_html=True)
1067
+
1068
+ # Check if chat is already set up for these documents
1069
+ session_id = get_session_id(selected)
1070
+
1071
+ if session_id not in st.session_state.get('chat_sessions', {}):
1072
+ # If documents have been analyzed, show a message
1073
+ if all_analyzed or has_combined_analysis:
1074
+ st.info("Documents have been analyzed. Setting up chat functionality...")
1075
+
1076
+ # Process documents and set up chat
1077
+ processed_docs = process_documents(selected)
1078
+ if processed_docs:
1079
+ new_session_id = setup_document_chat(processed_docs)
1080
+ if new_session_id:
1081
+ session_id = new_session_id
1082
+ st.success("Chat is ready! Ask questions about your documents below.")
1083
+ else:
1084
+ st.error("Failed to set up chat for these documents.")
1085
+ st.stop()
1086
+ else:
1087
+ st.error("Could not process the selected documents.")
1088
+ st.stop()
1089
+
1090
+ # Chat interface
1091
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
1092
+ user_question = st.text_input("Ask a question about your documents:")
1093
+
1094
+ # Use session history
1095
+ if session_id in st.session_state['session_history']:
1096
+ # Display chat history
1097
+ for exchange in st.session_state['session_history'][session_id]:
1098
+ st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
1099
+ st.markdown(f"<div class='chat-message chat-user'><strong>You:</strong> {exchange['question']}</div>", unsafe_allow_html=True)
1100
+ st.markdown(f"<div class='chat-message chat-ai'><strong>DocMind AI:</strong> {exchange['answer']}</div>", unsafe_allow_html=True)
1101
+ st.markdown("</div>", unsafe_allow_html=True)
1102
+
1103
+ if user_question:
1104
+ with st.spinner("Generating response..."):
1105
+ try:
1106
+ # Get the QA function for this session
1107
+ qa_function = st.session_state['chat_sessions'][session_id]
1108
+ response = qa_function(user_question)
1109
+
1110
+ # Add to session history
1111
+ if session_id not in st.session_state['session_history']:
1112
+ st.session_state['session_history'][session_id] = []
1113
+
1114
+ st.session_state['session_history'][session_id].append({
1115
+ "question": user_question,
1116
+ "answer": response['answer']
1117
+ })
1118
+
1119
+ # Force refresh to show new message
1120
+ st.rerun()
1121
+
1122
+ except Exception as e:
1123
+ st.error(f"Error generating response: {str(e)}")
1124
+
1125
+ st.markdown("</div>", unsafe_allow_html=True)
1126
+
1127
+ # Option to clear chat history
1128
+ if session_id in st.session_state['session_history'] and st.session_state['session_history'][session_id]:
1129
+ if st.button("Clear Chat History"):
1130
+ st.session_state['session_history'][session_id] = []
1131
+ st.success("Chat history cleared!")
1132
+ st.rerun()
1133
+ else:
1134
+ st.info("Please select documents from the 'Upload & Manage Documents' tab first.")
1135
+
1136
+ # Footer
1137
+ st.markdown("---")
1138
+ st.markdown(
1139
+ """
1140
+ <div style="text-align: center">
1141
+ <p>Built with ❤️ using Streamlit, LangChain, and Gemini model</p>
1142
+ <p>DocMind AI - AI-Powered Document Analysis</p>
1143
+ </div>
1144
+ """,
1145
+ unsafe_allow_html=True
1146
+ )
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chromadb
2
+ fastapi
3
+ langchain
4
+ langchain-community
5
+ langchain-core
6
+ langchain-text-splitters
7
+ langchain_openai
8
+ langdetect
9
+ langsmith
10
+ numpy
11
+ openai
12
+ pandas
13
+ pdf2image
14
+ pillow
15
+ pypdf
16
+ PyPika
17
+ python-docx
18
+ python-pptx
19
+ streamlit