Lohith01 committed on
Commit
edeb1a2
·
verified ·
1 Parent(s): 4287bf3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -7
app.py CHANGED
@@ -5,29 +5,33 @@ import os
5
  import streamlit as st
6
 
7
  def extract_pdf_text(pdf_file):
 
8
  reader = PyPDF2.PdfReader(pdf_file)
9
  text = ''
10
  for page in reader.pages:
11
- text += page.extract_text()
12
- return text
 
 
13
 
14
  def extract_excel_text(excel_file):
15
  workbook = openpyxl.load_workbook(excel_file)
16
  sheet = workbook.active
17
  text = ''
18
  for row in sheet.iter_rows(values_only=True):
 
19
  row_text = ' '.join([str(cell) for cell in row if cell is not None])
20
- text += row_text + '\n'
21
- return text
22
 
23
  def extract_html_text(html_file):
24
  soup = BeautifulSoup(html_file, 'html.parser')
25
- text = soup.get_text() # Extract visible text
26
- return text
27
 
28
  def extract_txt_text(txt_file):
29
  text = txt_file.read().decode('utf-8') # Read entire text file and decode
30
- return text
31
 
32
  def process_file(file):
33
  extension = os.path.splitext(file.name)[1].lower()
 
5
  import streamlit as st
6
 
7
  def extract_pdf_text(pdf_file):
8
+ # Read all pages of the PDF file
9
  reader = PyPDF2.PdfReader(pdf_file)
10
  text = ''
11
  for page in reader.pages:
12
+ page_text = page.extract_text()
13
+ if page_text: # Ensure there's text to append
14
+ text += page_text + '\n' # Add a newline to separate pages
15
+ return text.strip() # Remove trailing whitespace
16
 
17
  def extract_excel_text(excel_file):
18
  workbook = openpyxl.load_workbook(excel_file)
19
  sheet = workbook.active
20
  text = ''
21
  for row in sheet.iter_rows(values_only=True):
22
+ # Concatenate all cells in the row, ensuring no cells are skipped
23
  row_text = ' '.join([str(cell) for cell in row if cell is not None])
24
+ text += row_text + '\n' # Newline for each row
25
+ return text.strip() # Remove trailing whitespace
26
 
27
  def extract_html_text(html_file):
28
  soup = BeautifulSoup(html_file, 'html.parser')
29
+ text = soup.get_text(separator='\n') # Use separator to maintain line breaks
30
+ return text.strip() # Remove trailing whitespace
31
 
32
  def extract_txt_text(txt_file):
33
  text = txt_file.read().decode('utf-8') # Read entire text file and decode
34
+ return text.strip() # Remove trailing whitespace
35
 
36
  def process_file(file):
37
  extension = os.path.splitext(file.name)[1].lower()