Spaces:

Lohith01
/

text

Sleeping

Lohith01 committed on Sep 24, 2024

Commit

edeb1a2

verified ·

1 Parent(s): 4287bf3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,29 +5,33 @@ import os
 import streamlit as st
 def extract_pdf_text(pdf_file):
     reader = PyPDF2.PdfReader(pdf_file)
     text = ''
     for page in reader.pages:
-        text += page.extract_text()
-    return text
 def extract_excel_text(excel_file):
     workbook = openpyxl.load_workbook(excel_file)
     sheet = workbook.active
     text = ''
     for row in sheet.iter_rows(values_only=True):
         row_text = ' '.join([str(cell) for cell in row if cell is not None])
-        text += row_text + '\n'
-    return text
 def extract_html_text(html_file):
     soup = BeautifulSoup(html_file, 'html.parser')
-    text = soup.get_text()  # Extract visible text
-    return text
 def extract_txt_text(txt_file):
     text = txt_file.read().decode('utf-8')  # Read entire text file and decode
-    return text
 def process_file(file):
     extension = os.path.splitext(file.name)[1].lower()

 import streamlit as st
 def extract_pdf_text(pdf_file):
     """Return the text of every page of ``pdf_file`` as one string.

     Each page is read with ``extract_text()``; pages that yield nothing are
     left out. The remaining page texts are separated by a newline and the
     combined string is stripped of leading and trailing whitespace.
     """
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     page_texts = (page.extract_text() for page in pdf_reader.pages)
     # Joining with '\n' and stripping matches appending "text + '\n'" per page
     # and stripping afterwards: only the trailing newline differs pre-strip.
     return '\n'.join(chunk for chunk in page_texts if chunk).strip()
 def extract_excel_text(excel_file):
     """Return the cell values of the workbook's active sheet as text.

     Every row of the active sheet becomes one line: its non-``None`` cell
     values are converted with ``str()`` and separated by single spaces
     (``None`` cells are omitted). Lines are separated by newlines and the
     final string is stripped of leading and trailing whitespace.
     """
     active_sheet = openpyxl.load_workbook(excel_file).active
     row_lines = (
         ' '.join(str(value) for value in row if value is not None)
         for row in active_sheet.iter_rows(values_only=True)
     )
     return '\n'.join(row_lines).strip()
 def extract_html_text(html_file):
     """Return the text content of an HTML document.

     The markup is parsed with BeautifulSoup's ``html.parser`` backend; the
     text pieces are joined with newlines and the result is stripped of
     leading and trailing whitespace.
     """
     parsed_document = BeautifulSoup(html_file, 'html.parser')
     return parsed_document.get_text(separator='\n').strip()
 def extract_txt_text(txt_file):
     """Return the decoded, whitespace-trimmed contents of a text file.

     Args:
         txt_file: File-like object whose ``read()`` returns the whole file,
             normally as UTF-8 encoded bytes. A handle that already returns
             ``str`` is accepted as well.

     Returns:
         The file's text with leading and trailing whitespace removed.

     Raises:
         UnicodeDecodeError: If the bytes are not valid UTF-8.
     """
     raw = txt_file.read()
     if isinstance(raw, str):
         # Already decoded (text-mode handle); calling .decode() would fail.
         return raw.strip()
     # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading byte-order
     # mark, which str.strip() would otherwise leave at the start of the text.
     return raw.decode('utf-8-sig').strip()
 def process_file(file):
     extension = os.path.splitext(file.name)[1].lower()