File size: 2,721 Bytes
7885ecb
 
 
 
92cb1d3
7885ecb
 
edeb1a2
4287bf3
 
 
edeb1a2
 
 
 
7885ecb
 
 
 
 
 
edeb1a2
7885ecb
edeb1a2
 
7885ecb
 
4287bf3
edeb1a2
 
92cb1d3
 
4287bf3
edeb1a2
7885ecb
65ed1d0
 
92cb1d3
7885ecb
65ed1d0
92cb1d3
65ed1d0
92cb1d3
65ed1d0
92cb1d3
 
7885ecb
 
 
92cb1d3
65ed1d0
 
92cb1d3
65ed1d0
 
92cb1d3
 
 
 
e04d89a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import PyPDF2
import openpyxl
from bs4 import BeautifulSoup
import os
import streamlit as st

def extract_pdf_text(pdf_file):
    """Return the combined text of every page in a PDF.

    The document is opened with ``PyPDF2.PdfReader``. Pages whose
    ``extract_text()`` result is falsy (empty or ``None``) are left out;
    the remaining page texts are separated by newlines and the final
    string is stripped of leading and trailing whitespace.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    per_page = (page.extract_text() for page in pdf_reader.pages)
    # Only pages that actually produced text contribute to the output.
    combined = '\n'.join(chunk for chunk in per_page if chunk)
    return combined.strip()

def extract_excel_text(excel_file):
    """Return the active worksheet's contents as plain text.

    Only the workbook's active sheet is read. Each row becomes one line
    built from its non-``None`` cell values (converted with ``str``) joined
    by single spaces; a row with no values becomes an empty line. Lines are
    separated by newlines and the final string is stripped of leading and
    trailing whitespace.
    """
    active_sheet = openpyxl.load_workbook(excel_file).active
    row_lines = []
    for cells in active_sheet.iter_rows(values_only=True):
        # Empty (None) cells are omitted rather than rendered as "None".
        populated = (str(value) for value in cells if value is not None)
        row_lines.append(' '.join(populated))
    return '\n'.join(row_lines).strip()

def extract_html_text(html_file):
    """Return the text content of an HTML document.

    The markup is parsed with BeautifulSoup using the ``html.parser``
    backend. Text fragments are joined with newlines so line breaks are
    kept, and the result is stripped of leading and trailing whitespace.
    """
    parsed_document = BeautifulSoup(html_file, 'html.parser')
    return parsed_document.get_text(separator='\n').strip()

def extract_txt_text(txt_file):
    """Return the decoded contents of a plain-text file, stripped.

    Args:
        txt_file: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        The file's text with leading and trailing whitespace removed.

    The bytes are decoded as UTF-8. A leading byte-order mark is dropped,
    and byte sequences that are not valid UTF-8 are replaced with U+FFFD
    instead of raising ``UnicodeDecodeError``, so files saved in another
    encoding still yield their readable portion.
    """
    raw_bytes = txt_file.read()
    # 'utf-8-sig' decodes ordinary UTF-8 and also consumes a leading BOM,
    # which str.strip() would otherwise leave at the start of the text.
    decoded = raw_bytes.decode('utf-8-sig', errors='replace')
    return decoded.strip()

def process_file(file):
    """Extract text from a file, choosing the extractor by its extension.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            ``name`` (case-insensitive) selects the extractor.

    Returns:
        The extracted text, or a message string beginning with
        "Unsupported file format" when no extractor can handle the file.
    """
    extension = os.path.splitext(file.name)[1].lower()

    if extension == '.pdf':
        return extract_pdf_text(file)
    if extension == '.xlsx':
        return extract_excel_text(file)
    if extension == '.xls':
        # The Excel extractor relies on openpyxl, which reads only the
        # zip-based OOXML format; a legacy binary .xls workbook would make
        # load_workbook raise, so report it like other unsupported input.
        return (
            "Unsupported file format: legacy .xls workbooks cannot be read; "
            "please save the file as .xlsx."
        )
    if extension in ('.html', '.htm'):
        return extract_html_text(file)
    if extension == '.txt':
        return extract_txt_text(file)
    return "Unsupported file format."

# Streamlit application: upload a file, display its extracted text, and let
# the user filter that text line by line.
st.title("File Content Extractor")

uploaded_file = st.file_uploader(
    "Choose a file",
    type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'],
)

if uploaded_file is not None:
    # Run the extractor that matches the upload and show everything it found.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # Optional search box; an empty query shows no results section.
    search_query = st.text_input("Enter text to search for:")

    if search_query:
        # Case-insensitive substring match, evaluated one line at a time.
        needle = search_query.lower()
        search_results = [
            line for line in content.split('\n') if needle in line.lower()
        ]

        st.subheader("Search Results:")
        if not search_results:
            st.text("No matching content found.")
        for result in search_results:
            st.text(result)