Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,9 @@ os.environ["TORCH_HOME"] = "/home/user/.cache/torch"
|
|
| 11 |
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
|
| 12 |
|
| 13 |
import torch
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Set Streamlit page config
|
| 16 |
st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
|
|
@@ -160,10 +163,8 @@ def split_into_chunks(text, max_tokens=1024):
|
|
| 160 |
Returns:
|
| 161 |
List[str]: A list of text chunks.
|
| 162 |
"""
|
| 163 |
-
import nltk
|
| 164 |
-
from nltk.tokenize import sent_tokenize
|
| 165 |
|
| 166 |
-
#
|
| 167 |
nltk.download('punkt', quiet=True)
|
| 168 |
|
| 169 |
sentences = sent_tokenize(text)
|
|
@@ -189,6 +190,7 @@ def split_into_chunks(text, max_tokens=1024):
|
|
| 189 |
return chunks
|
| 190 |
|
| 191 |
|
|
|
|
| 192 |
# Ensure session state is initialized
|
| 193 |
if "pdf_text" not in st.session_state:
|
| 194 |
st.session_state["pdf_text"] = ""
|
|
@@ -285,8 +287,10 @@ else:
|
|
| 285 |
|
| 286 |
# Step 4: Summarization (Using full text)
|
| 287 |
|
|
|
|
| 288 |
st.subheader("Summarization")
|
| 289 |
|
|
|
|
| 290 |
input_text = st.text_area(
|
| 291 |
"Enter text to summarize",
|
| 292 |
height=400,
|
|
@@ -295,7 +299,8 @@ else:
|
|
| 295 |
|
| 296 |
if st.button("Summarize"):
|
| 297 |
text_to_summarize = input_text.strip()
|
| 298 |
-
text_to_summarize = re.sub(r'\s+', ' ', text_to_summarize)
|
|
|
|
| 299 |
st.write(f"Original text length: {len(text_to_summarize.split())} words")
|
| 300 |
|
| 301 |
if not text_to_summarize:
|
|
@@ -305,25 +310,35 @@ else:
|
|
| 305 |
with st.spinner("Generating summary..."):
|
| 306 |
# Split text into manageable chunks
|
| 307 |
chunks = split_into_chunks(text_to_summarize)
|
|
|
|
|
|
|
| 308 |
summaries = []
|
| 309 |
|
| 310 |
for i, chunk in enumerate(chunks):
|
| 311 |
st.write(f"Summarizing chunk {i+1}/{len(chunks)}")
|
| 312 |
summary_output = summarizer(
|
| 313 |
chunk,
|
| 314 |
-
max_length=150,
|
| 315 |
-
min_length=50,
|
| 316 |
do_sample=False,
|
| 317 |
truncation=True
|
| 318 |
)
|
| 319 |
-
summary = summary_output[0]['summary_text'].strip()
|
| 320 |
-
summaries.append(summary)
|
| 321 |
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
except Exception as e:
|
| 328 |
st.error(f"Summarization failed: {e}")
|
| 329 |
-
st.text(traceback.format_exc())
|
|
|
|
| 11 |
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
|
| 12 |
|
| 13 |
import torch
|
| 14 |
+
import nltk
|
| 15 |
+
from nltk.tokenize import sent_tokenize
|
| 16 |
+
import traceback
|
| 17 |
|
| 18 |
# Set Streamlit page config
|
| 19 |
st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
|
|
|
|
| 163 |
Returns:
|
| 164 |
List[str]: A list of text chunks.
|
| 165 |
"""
|
|
|
|
|
|
|
| 166 |
|
| 167 |
+
# Make sure the NLTK 'punkt' tokenizer data is available; sent_tokenize below depends on it
|
| 168 |
nltk.download('punkt', quiet=True)
|
| 169 |
|
| 170 |
sentences = sent_tokenize(text)
|
|
|
|
| 190 |
return chunks
|
| 191 |
|
| 192 |
|
| 193 |
+
|
| 194 |
# Ensure session state is initialized
|
| 195 |
if "pdf_text" not in st.session_state:
|
| 196 |
st.session_state["pdf_text"] = ""
|
|
|
|
| 287 |
|
| 288 |
# Step 4: Summarization (Using full text)
|
| 289 |
|
| 290 |
+
|
| 291 |
st.subheader("Summarization")
|
| 292 |
|
| 293 |
+
# Display full extracted text
|
| 294 |
input_text = st.text_area(
|
| 295 |
"Enter text to summarize",
|
| 296 |
height=400,
|
|
|
|
| 299 |
|
| 300 |
if st.button("Summarize"):
|
| 301 |
text_to_summarize = input_text.strip()
|
| 302 |
+
text_to_summarize = re.sub(r'\s+', ' ', text_to_summarize) # Collapse every run of whitespace (spaces, tabs, newlines) into a single space
|
| 303 |
+
|
| 304 |
st.write(f"Original text length: {len(text_to_summarize.split())} words")
|
| 305 |
|
| 306 |
if not text_to_summarize:
|
|
|
|
| 310 |
with st.spinner("Generating summary..."):
|
| 311 |
# Split text into manageable chunks
|
| 312 |
chunks = split_into_chunks(text_to_summarize)
|
| 313 |
+
st.write(f"Text has been split into {len(chunks)} chunks.")
|
| 314 |
+
|
| 315 |
summaries = []
|
| 316 |
|
| 317 |
for i, chunk in enumerate(chunks):
|
| 318 |
st.write(f"Summarizing chunk {i+1}/{len(chunks)}")
|
| 319 |
summary_output = summarizer(
|
| 320 |
chunk,
|
| 321 |
+
max_length=150, # Adjust as needed
|
| 322 |
+
min_length=50, # Adjust as needed
|
| 323 |
do_sample=False,
|
| 324 |
truncation=True
|
| 325 |
)
|
|
|
|
|
|
|
| 326 |
|
| 327 |
+
if summary_output and 'summary_text' in summary_output[0]:
|
| 328 |
+
summary = summary_output[0]['summary_text'].strip()
|
| 329 |
+
summaries.append(summary)
|
| 330 |
+
else:
|
| 331 |
+
st.error(f"Summarization failed for chunk {i+1}: No summary text returned.")
|
| 332 |
+
continue
|
| 333 |
+
|
| 334 |
+
if summaries:
|
| 335 |
+
# Combine summaries
|
| 336 |
+
final_summary = ' '.join(summaries)
|
| 337 |
+
st.write("Final Summary:")
|
| 338 |
+
st.success(final_summary)
|
| 339 |
+
else:
|
| 340 |
+
st.error("No summaries were generated.")
|
| 341 |
|
| 342 |
except Exception as e:
|
| 343 |
st.error(f"Summarization failed: {e}")
|
| 344 |
+
st.text(traceback.format_exc())
|