Spaces:
Runtime error
Runtime error
Commit · 76cdde2
1 Parent(s): cdadb63
Bug fixes
Browse files
- app.py +14 -2
- document_processor.py +7 -6
app.py
CHANGED
|
@@ -120,6 +120,8 @@ def initialize_session_state():
|
|
| 120 |
st.session_state.analysis_result = None
|
| 121 |
if 'document_loaded' not in st.session_state:
|
| 122 |
st.session_state.document_loaded = False
|
|
|
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
def display_criteria_rules(criteria):
|
|
@@ -281,13 +283,23 @@ def main():
|
|
| 281 |
)
|
| 282 |
|
| 283 |
if uploaded_file is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
# Save to temp file
|
| 285 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
| 286 |
tmp_file.write(uploaded_file.getvalue())
|
| 287 |
tmp_path = tmp_file.name
|
| 288 |
|
| 289 |
-
# Load document
|
| 290 |
-
if not st.session_state.document_loaded
|
| 291 |
with st.spinner("Loading and indexing document..."):
|
| 292 |
try:
|
| 293 |
processor = InvestmentDocumentProcessor(api_key)
|
|
|
|
| 120 |
st.session_state.analysis_result = None
|
| 121 |
if 'document_loaded' not in st.session_state:
|
| 122 |
st.session_state.document_loaded = False
|
| 123 |
+
if 'current_file_name' not in st.session_state:
|
| 124 |
+
st.session_state.current_file_name = None
|
| 125 |
|
| 126 |
|
| 127 |
def display_criteria_rules(criteria):
|
|
|
|
| 283 |
)
|
| 284 |
|
| 285 |
if uploaded_file is not None:
|
| 286 |
+
# Check if file has changed
|
| 287 |
+
file_changed = (uploaded_file.name != st.session_state.current_file_name)
|
| 288 |
+
|
| 289 |
+
if file_changed:
|
| 290 |
+
# Reset session state for new file
|
| 291 |
+
st.session_state.current_file_name = uploaded_file.name
|
| 292 |
+
st.session_state.document_loaded = False
|
| 293 |
+
st.session_state.analysis_result = None
|
| 294 |
+
st.session_state.processor = None
|
| 295 |
+
|
| 296 |
# Save to temp file
|
| 297 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
| 298 |
tmp_file.write(uploaded_file.getvalue())
|
| 299 |
tmp_path = tmp_file.name
|
| 300 |
|
| 301 |
+
# Load document if not already loaded
|
| 302 |
+
if not st.session_state.document_loaded:
|
| 303 |
with st.spinner("Loading and indexing document..."):
|
| 304 |
try:
|
| 305 |
processor = InvestmentDocumentProcessor(api_key)
|
document_processor.py
CHANGED
|
@@ -28,9 +28,10 @@ class InvestmentDocumentProcessor:
|
|
| 28 |
Settings.llm = self.llm
|
| 29 |
|
| 30 |
# Node parser to chunk documents while preserving metadata
|
|
|
|
| 31 |
self.node_parser = SimpleNodeParser.from_defaults(
|
| 32 |
-
chunk_size=
|
| 33 |
-
chunk_overlap=
|
| 34 |
)
|
| 35 |
|
| 36 |
self.index = None
|
|
@@ -69,8 +70,8 @@ class InvestmentDocumentProcessor:
|
|
| 69 |
|
| 70 |
# Create query engine with citation tracking
|
| 71 |
query_engine = self.index.as_query_engine(
|
| 72 |
-
similarity_top_k=
|
| 73 |
-
response_mode="
|
| 74 |
)
|
| 75 |
|
| 76 |
# Query with the criteria prompt
|
|
@@ -107,9 +108,9 @@ class InvestmentDocumentProcessor:
|
|
| 107 |
"index": idx + 1,
|
| 108 |
"page": page,
|
| 109 |
"score": node.score,
|
| 110 |
-
"text_preview": node.node.text[:
|
| 111 |
"full_text": node.node.text,
|
| 112 |
-
"is_truncated": len(node.node.text) >
|
| 113 |
"file_name": node.node.metadata.get('file_name', 'Unknown')
|
| 114 |
}
|
| 115 |
citations.append(citation)
|
|
|
|
| 28 |
Settings.llm = self.llm
|
| 29 |
|
| 30 |
# Node parser to chunk documents while preserving metadata
|
| 31 |
+
# Larger chunks to capture complete financial statements/tables
|
| 32 |
self.node_parser = SimpleNodeParser.from_defaults(
|
| 33 |
+
chunk_size=2048,
|
| 34 |
+
chunk_overlap=400
|
| 35 |
)
|
| 36 |
|
| 37 |
self.index = None
|
|
|
|
| 70 |
|
| 71 |
# Create query engine with citation tracking
|
| 72 |
query_engine = self.index.as_query_engine(
|
| 73 |
+
similarity_top_k=20, # Increased to get more diverse context
|
| 74 |
+
response_mode="compact" # More focused on relevant chunks
|
| 75 |
)
|
| 76 |
|
| 77 |
# Query with the criteria prompt
|
|
|
|
| 108 |
"index": idx + 1,
|
| 109 |
"page": page,
|
| 110 |
"score": node.score,
|
| 111 |
+
"text_preview": node.node.text[:350] + "..." if len(node.node.text) > 350 else node.node.text,
|
| 112 |
"full_text": node.node.text,
|
| 113 |
+
"is_truncated": len(node.node.text) > 350,
|
| 114 |
"file_name": node.node.metadata.get('file_name', 'Unknown')
|
| 115 |
}
|
| 116 |
citations.append(citation)
|