sukhrobnurali committed on
Commit
76cdde2
·
1 Parent(s): cdadb63

Bug fixes

Browse files
Files changed (2) hide show
  1. app.py +14 -2
  2. document_processor.py +7 -6
app.py CHANGED
@@ -120,6 +120,8 @@ def initialize_session_state():
120
  st.session_state.analysis_result = None
121
  if 'document_loaded' not in st.session_state:
122
  st.session_state.document_loaded = False
 
 
123
 
124
 
125
  def display_criteria_rules(criteria):
@@ -281,13 +283,23 @@ def main():
281
  )
282
 
283
  if uploaded_file is not None:
 
 
 
 
 
 
 
 
 
 
284
  # Save to temp file
285
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
286
  tmp_file.write(uploaded_file.getvalue())
287
  tmp_path = tmp_file.name
288
 
289
- # Load document
290
- if not st.session_state.document_loaded or st.session_state.processor is None:
291
  with st.spinner("Loading and indexing document..."):
292
  try:
293
  processor = InvestmentDocumentProcessor(api_key)
 
120
  st.session_state.analysis_result = None
121
  if 'document_loaded' not in st.session_state:
122
  st.session_state.document_loaded = False
123
+ if 'current_file_name' not in st.session_state:
124
+ st.session_state.current_file_name = None
125
 
126
 
127
  def display_criteria_rules(criteria):
 
283
  )
284
 
285
  if uploaded_file is not None:
286
+ # Check if file has changed
287
+ file_changed = (uploaded_file.name != st.session_state.current_file_name)
288
+
289
+ if file_changed:
290
+ # Reset session state for new file
291
+ st.session_state.current_file_name = uploaded_file.name
292
+ st.session_state.document_loaded = False
293
+ st.session_state.analysis_result = None
294
+ st.session_state.processor = None
295
+
296
  # Save to temp file
297
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
298
  tmp_file.write(uploaded_file.getvalue())
299
  tmp_path = tmp_file.name
300
 
301
+ # Load document if not already loaded
302
+ if not st.session_state.document_loaded:
303
  with st.spinner("Loading and indexing document..."):
304
  try:
305
  processor = InvestmentDocumentProcessor(api_key)
document_processor.py CHANGED
@@ -28,9 +28,10 @@ class InvestmentDocumentProcessor:
28
  Settings.llm = self.llm
29
 
30
  # Node parser to chunk documents while preserving metadata
 
31
  self.node_parser = SimpleNodeParser.from_defaults(
32
- chunk_size=1024,
33
- chunk_overlap=200
34
  )
35
 
36
  self.index = None
@@ -69,8 +70,8 @@ class InvestmentDocumentProcessor:
69
 
70
  # Create query engine with citation tracking
71
  query_engine = self.index.as_query_engine(
72
- similarity_top_k=10, # Get more context
73
- response_mode="tree_summarize"
74
  )
75
 
76
  # Query with the criteria prompt
@@ -107,9 +108,9 @@ class InvestmentDocumentProcessor:
107
  "index": idx + 1,
108
  "page": page,
109
  "score": node.score,
110
- "text_preview": node.node.text[:200] + "..." if len(node.node.text) > 200 else node.node.text,
111
  "full_text": node.node.text,
112
- "is_truncated": len(node.node.text) > 200,
113
  "file_name": node.node.metadata.get('file_name', 'Unknown')
114
  }
115
  citations.append(citation)
 
28
  Settings.llm = self.llm
29
 
30
  # Node parser to chunk documents while preserving metadata
31
+ # Larger chunks to capture complete financial statements/tables
32
  self.node_parser = SimpleNodeParser.from_defaults(
33
+ chunk_size=2048,
34
+ chunk_overlap=400
35
  )
36
 
37
  self.index = None
 
70
 
71
  # Create query engine with citation tracking
72
  query_engine = self.index.as_query_engine(
73
+ similarity_top_k=20, # Increased to get more diverse context
74
+ response_mode="compact" # More focused on relevant chunks
75
  )
76
 
77
  # Query with the criteria prompt
 
108
  "index": idx + 1,
109
  "page": page,
110
  "score": node.score,
111
+ "text_preview": node.node.text[:350] + "..." if len(node.node.text) > 350 else node.node.text,
112
  "full_text": node.node.text,
113
+ "is_truncated": len(node.node.text) > 350,
114
  "file_name": node.node.metadata.get('file_name', 'Unknown')
115
  }
116
  citations.append(citation)