kritsadaK committed on
Commit
d59e674
·
verified ·
1 Parent(s): 2afc718

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -13
app.py CHANGED
@@ -11,6 +11,9 @@ os.environ["TORCH_HOME"] = "/home/user/.cache/torch"
11
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
12
 
13
  import torch
 
 
 
14
 
15
  # Set Streamlit page config
16
  st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
@@ -160,10 +163,8 @@ def split_into_chunks(text, max_tokens=1024):
160
  Returns:
161
  List[str]: A list of text chunks.
162
  """
163
- import nltk
164
- from nltk.tokenize import sent_tokenize
165
 
166
- # Download the Punkt tokenizer if not already downloaded
167
  nltk.download('punkt', quiet=True)
168
 
169
  sentences = sent_tokenize(text)
@@ -189,6 +190,7 @@ def split_into_chunks(text, max_tokens=1024):
189
  return chunks
190
 
191
 
 
192
  # Ensure session state is initialized
193
  if "pdf_text" not in st.session_state:
194
  st.session_state["pdf_text"] = ""
@@ -285,8 +287,10 @@ else:
285
 
286
  # Step 4: Summarization (Using full text)
287
 
 
288
  st.subheader("Summarization")
289
 
 
290
  input_text = st.text_area(
291
  "Enter text to summarize",
292
  height=400,
@@ -295,7 +299,8 @@ else:
295
 
296
  if st.button("Summarize"):
297
  text_to_summarize = input_text.strip()
298
- text_to_summarize = re.sub(r'\s+', ' ', text_to_summarize)
 
299
  st.write(f"Original text length: {len(text_to_summarize.split())} words")
300
 
301
  if not text_to_summarize:
@@ -305,25 +310,35 @@ else:
305
  with st.spinner("Generating summary..."):
306
  # Split text into manageable chunks
307
  chunks = split_into_chunks(text_to_summarize)
 
 
308
  summaries = []
309
 
310
  for i, chunk in enumerate(chunks):
311
  st.write(f"Summarizing chunk {i+1}/{len(chunks)}")
312
  summary_output = summarizer(
313
  chunk,
314
- max_length=150,
315
- min_length=50,
316
  do_sample=False,
317
  truncation=True
318
  )
319
- summary = summary_output[0]['summary_text'].strip()
320
- summaries.append(summary)
321
 
322
- # Combine summaries
323
- final_summary = ' '.join(summaries)
324
- st.write("Final Summary:")
325
- st.success(final_summary)
 
 
 
 
 
 
 
 
 
 
326
 
327
  except Exception as e:
328
  st.error(f"Summarization failed: {e}")
329
- st.text(traceback.format_exc())
 
11
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
12
 
13
  import torch
14
+ import nltk
15
+ from nltk.tokenize import sent_tokenize
16
+ import traceback
17
 
18
  # Set Streamlit page config
19
  st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
 
163
  Returns:
164
  List[str]: A list of text chunks.
165
  """
 
 
166
 
167
+ # Ensure NLTK 'punkt' resource is downloaded
168
  nltk.download('punkt', quiet=True)
169
 
170
  sentences = sent_tokenize(text)
 
190
  return chunks
191
 
192
 
193
+
194
  # Ensure session state is initialized
195
  if "pdf_text" not in st.session_state:
196
  st.session_state["pdf_text"] = ""
 
287
 
288
  # Step 4: Summarization (Using full text)
289
 
290
+
291
  st.subheader("Summarization")
292
 
293
+ # Display full extracted text
294
  input_text = st.text_area(
295
  "Enter text to summarize",
296
  height=400,
 
299
 
300
  if st.button("Summarize"):
301
  text_to_summarize = input_text.strip()
302
+ text_to_summarize = re.sub(r'\s+', ' ', text_to_summarize) # Replace multiple whitespaces with a single space
303
+
304
  st.write(f"Original text length: {len(text_to_summarize.split())} words")
305
 
306
  if not text_to_summarize:
 
310
  with st.spinner("Generating summary..."):
311
  # Split text into manageable chunks
312
  chunks = split_into_chunks(text_to_summarize)
313
+ st.write(f"Text has been split into {len(chunks)} chunks.")
314
+
315
  summaries = []
316
 
317
  for i, chunk in enumerate(chunks):
318
  st.write(f"Summarizing chunk {i+1}/{len(chunks)}")
319
  summary_output = summarizer(
320
  chunk,
321
+ max_length=150, # Adjust as needed
322
+ min_length=50, # Adjust as needed
323
  do_sample=False,
324
  truncation=True
325
  )
 
 
326
 
327
+ if summary_output and 'summary_text' in summary_output[0]:
328
+ summary = summary_output[0]['summary_text'].strip()
329
+ summaries.append(summary)
330
+ else:
331
+ st.error(f"Summarization failed for chunk {i+1}: No summary text returned.")
332
+ continue
333
+
334
+ if summaries:
335
+ # Combine summaries
336
+ final_summary = ' '.join(summaries)
337
+ st.write("Final Summary:")
338
+ st.success(final_summary)
339
+ else:
340
+ st.error("No summaries were generated.")
341
 
342
  except Exception as e:
343
  st.error(f"Summarization failed: {e}")
344
+ st.text(traceback.format_exc())