import PyPDF2
import openpyxl
from bs4 import BeautifulSoup
import os
import streamlit as st
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF as one string.

    Pages for which ``extract_text()`` yields nothing are left out. The
    remaining page texts are separated by newlines, and the combined
    result has leading and trailing whitespace removed.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    extracted = (page.extract_text() for page in pdf.pages)
    # Drop empty/None results so pages without text add no blank lines.
    return '\n'.join(chunk for chunk in extracted if chunk).strip()
def extract_excel_text(excel_file):
    """Return the active worksheet of a workbook as plain text.

    Every row becomes one line: its non-``None`` cells are converted with
    ``str`` and separated by single spaces. Lines are joined with newlines
    and the result has leading and trailing whitespace removed.
    """
    active_sheet = openpyxl.load_workbook(excel_file).active
    rendered_rows = []
    for cells in active_sheet.iter_rows(values_only=True):
        # Empty cells come back as None and are skipped entirely.
        present = [str(value) for value in cells if value is not None]
        rendered_rows.append(' '.join(present))
    return '\n'.join(rendered_rows).strip()
def extract_html_text(html_file):
    """Return the text content of an HTML document.

    The markup is parsed with the built-in ``html.parser`` backend; text
    fragments are joined with newlines, and the result has leading and
    trailing whitespace removed.
    """
    parsed = BeautifulSoup(html_file, 'html.parser')
    # A newline separator keeps text from adjacent elements on separate lines.
    return parsed.get_text(separator='\n').strip()
def extract_txt_text(txt_file):
    """Return the contents of a plain-text file as a stripped string.

    Args:
        txt_file: File-like object. Its ``read()`` normally returns bytes
            (a binary-mode upload); a ``str`` result is accepted as-is.

    Returns:
        The decoded text with leading and trailing whitespace removed.
    """
    raw = txt_file.read()
    if isinstance(raw, str):
        # Already decoded (text-mode stream); nothing to decode.
        text = raw
    else:
        # 'utf-8-sig' drops a leading byte-order mark, which str.strip()
        # would leave in place because U+FEFF is not whitespace.
        # errors='replace' keeps files in other encodings readable
        # (undecodable bytes become U+FFFD) instead of raising
        # UnicodeDecodeError.
        text = raw.decode('utf-8-sig', errors='replace')
    return text.strip()
def process_file(file):
    """Extract text from an uploaded file, choosing the extractor by extension.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            ``name`` (compared case-insensitively) selects the extractor.

    Returns:
        The extracted text, or a message starting with
        "Unsupported file format" when the extension cannot be handled.
    """
    extension = os.path.splitext(file.name)[1].lower()
    if extension == '.pdf':
        return extract_pdf_text(file)
    if extension == '.xlsx':
        return extract_excel_text(file)
    if extension == '.xls':
        # The Excel extractor is built on openpyxl, which only reads the
        # zip-based Office Open XML formats. Passing a legacy binary .xls
        # file to it raises instead of returning text, so report the
        # limitation with a message like other unsupported formats.
        return ("Unsupported file format: legacy .xls workbooks cannot be "
                "read; please convert the file to .xlsx.")
    if extension in ('.html', '.htm'):
        return extract_html_text(file)
    if extension == '.txt':
        return extract_txt_text(file)
    return "Unsupported file format."
# Streamlit application: upload a file, show its extracted text, and let
# the user search that text line by line.
st.title("File Content Extractor")
uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'])

if uploaded_file is not None:
    # Extract the upload's text and display all of it.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # Case-insensitive substring search over the extracted lines.
    search_query = st.text_input("Enter text to search for:")
    if search_query:
        needle = search_query.lower()
        search_results = [
            row for row in content.split('\n') if needle in row.lower()
        ]

        # The heading is shown whether or not anything matched.
        st.subheader("Search Results:")
        if search_results:
            for row in search_results:
                st.text(row)
        else:
            st.text("No matching content found.")
|