Update app.py
Browse files
app.py
CHANGED
|
@@ -5,29 +5,33 @@ import os
|
|
| 5 |
import streamlit as st
|
| 6 |
|
| 7 |
def extract_pdf_text(pdf_file):
|
|
|
|
| 8 |
reader = PyPDF2.PdfReader(pdf_file)
|
| 9 |
text = ''
|
| 10 |
for page in reader.pages:
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def extract_excel_text(excel_file):
    """Return the active worksheet's contents as newline-terminated text rows.

    Args:
        excel_file: Workbook source handed straight to
            ``openpyxl.load_workbook``.

    Returns:
        str: One line per worksheet row, each terminated by ``'\\n'``.
        Cell values are converted with ``str`` and separated by single
        spaces. Cells whose value is ``None`` are omitted, so column
        positions are not preserved. Only the workbook's active sheet is
        read.
    """
    workbook = openpyxl.load_workbook(excel_file)
    sheet = workbook.active
    # Collect the rows first and join once instead of growing a string
    # with ``+=`` on every iteration.
    row_lines = [
        ' '.join(str(cell) for cell in row if cell is not None)
        for row in sheet.iter_rows(values_only=True)
    ]
    return ''.join(line + '\n' for line in row_lines)
|
| 22 |
|
| 23 |
def extract_html_text(html_file):
    """Parse HTML with the ``html.parser`` backend and return its text.

    Args:
        html_file: Markup (or a file-like object) handed straight to
            ``BeautifulSoup``.

    Returns:
        str: The result of ``get_text()`` on the parsed document, unmodified.
    """
    return BeautifulSoup(html_file, 'html.parser').get_text()
|
| 27 |
|
| 28 |
def extract_txt_text(txt_file):
    """Read a whole text file and return its contents as ``str``.

    Args:
        txt_file: File-like object exposing ``read()``.

    Returns:
        str: The full file contents. Bytes are decoded as UTF-8; a leading
        byte-order mark, if present, is dropped.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    data = txt_file.read()
    if isinstance(data, str):
        # A stream opened in text mode already yields decoded text.
        return data
    # 'utf-8-sig' decodes plain UTF-8 identically but also consumes a
    # leading BOM, which plain 'utf-8' would leave in the text as '\ufeff'.
    return data.decode('utf-8-sig')
|
| 31 |
|
| 32 |
def process_file(file):
|
| 33 |
extension = os.path.splitext(file.name)[1].lower()
|
|
|
|
| 5 |
import streamlit as st
|
| 6 |
|
| 7 |
def extract_pdf_text(pdf_file):
    """Extract the text of every page of a PDF, one page per block.

    Args:
        pdf_file: PDF source handed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of each page that yielded any text, separated by
        newlines, with leading and trailing whitespace removed. Returns an
        empty string when no page yields text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Pages whose extraction result is falsy (empty or None) are skipped.
    # Joining once avoids growing a string with ``+=`` on every page.
    return '\n'.join(chunk for chunk in page_texts if chunk).strip()
|
| 16 |
|
| 17 |
def extract_excel_text(excel_file):
    """Return the active worksheet's contents as plain text, one row per line.

    Args:
        excel_file: Workbook source handed straight to
            ``openpyxl.load_workbook``.

    Returns:
        str: Rows separated by newlines, with leading and trailing
        whitespace removed from the overall result. Cell values are
        converted with ``str`` and separated by single spaces. Cells whose
        value is ``None`` are omitted, so column positions are not
        preserved. Only the workbook's active sheet is read.
    """
    workbook = openpyxl.load_workbook(excel_file)
    sheet = workbook.active
    # Collect the rows first and join once instead of growing a string
    # with ``+=`` on every iteration.
    row_lines = [
        ' '.join(str(cell) for cell in row if cell is not None)
        for row in sheet.iter_rows(values_only=True)
    ]
    return '\n'.join(row_lines).strip()
|
| 26 |
|
| 27 |
def extract_html_text(html_file):
    """Parse HTML with the ``html.parser`` backend and return its text.

    Args:
        html_file: Markup (or a file-like object) handed straight to
            ``BeautifulSoup``.

    Returns:
        str: The document text with ``'\\n'`` inserted between text
        fragments, with leading and trailing whitespace removed.
    """
    parsed = BeautifulSoup(html_file, 'html.parser')
    return parsed.get_text(separator='\n').strip()
|
| 31 |
|
| 32 |
def extract_txt_text(txt_file):
    """Read a whole text file and return its contents, stripped.

    Args:
        txt_file: File-like object exposing ``read()``.

    Returns:
        str: The full file contents with leading and trailing whitespace
        removed. Bytes are decoded as UTF-8; a leading byte-order mark, if
        present, is dropped.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    data = txt_file.read()
    if isinstance(data, bytes):
        # 'utf-8-sig' decodes plain UTF-8 identically but also consumes a
        # leading BOM. ``str.strip()`` does not treat '\ufeff' as
        # whitespace, so plain 'utf-8' would leave it in the result.
        data = data.decode('utf-8-sig')
    return data.strip()
|
| 35 |
|
| 36 |
def process_file(file):
|
| 37 |
extension = os.path.splitext(file.name)[1].lower()
|