v0.1
Browse files
app.py
CHANGED
|
@@ -39,10 +39,6 @@ def delete_pdf(pdf_file_name):
|
|
| 39 |
# Parse PDF
|
| 40 |
def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
|
| 41 |
doc = fitz.open(os.path.join(PDF_STORAGE_DIR, pdf_file_name))
|
| 42 |
-
if not page_numbers:
|
| 43 |
-
page_numbers = [0]
|
| 44 |
-
else:
|
| 45 |
-
page_numbers = [int(i) for i in page_numbers.split(',')]
|
| 46 |
pages = [doc.load_page(i) for i in page_numbers]
|
| 47 |
output = ""
|
| 48 |
|
|
@@ -51,20 +47,18 @@ def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
|
|
| 51 |
for page in pages:
|
| 52 |
output += "### Page %d\n" % page.number
|
| 53 |
output += page.get_text() + "\n\n"
|
| 54 |
-
|
| 55 |
-
# Search strings
|
| 56 |
-
output += "\n### Search Results:\n"
|
| 57 |
-
for search_string in search_strings:
|
| 58 |
-
for page in pages:
|
| 59 |
-
rect = page.search_for(search_string)
|
| 60 |
-
if rect:
|
| 61 |
-
output += f"{search_string} - {rect}\n"
|
| 62 |
-
else:
|
| 63 |
-
output += f"{search_string} not found\n"
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
if rect_coords:
|
| 69 |
x1, y1, x2, y2 = map(float, rect_coords.split(','))
|
| 70 |
rect = fitz.Rect(x1, y1, x2, y2)
|
|
@@ -73,49 +67,73 @@ def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
|
|
| 73 |
output += page.get_text("text", clip=rect) + "\n\n"
|
| 74 |
return output
|
| 75 |
|
| 76 |
-
# Main app
|
| 77 |
def main():
|
| 78 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
st.write("
|
| 83 |
-
col1, col2 = st.columns(2)
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
pdf_file_name = download_pdf(url)
|
| 90 |
-
st.success(f"PDF downloaded: {pdf_file_name}")
|
| 91 |
-
# Delete PDF
|
| 92 |
-
with col2:
|
| 93 |
pdf_files = os.listdir(PDF_STORAGE_DIR)
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
with st.container():
|
| 100 |
-
st.write("# Upload PDF")
|
| 101 |
-
# Upload PDF
|
| 102 |
-
pdf_file = st.file_uploader("Select PDF file")
|
| 103 |
-
if pdf_file:
|
| 104 |
-
pdf_file_name = upload_pdf(pdf_file)
|
| 105 |
-
st.success(f"PDF uploaded: {pdf_file_name}")
|
| 106 |
-
|
| 107 |
-
# Container for parsing
|
| 108 |
-
with st.container():
|
| 109 |
-
st.write("#### Parse PDF")
|
| 110 |
-
pdf_files = os.listdir(PDF_STORAGE_DIR)
|
| 111 |
-
pdf_file_name = st.selectbox("Select PDF file to parse", pdf_files)
|
| 112 |
-
page_numbers = st.text_input("Enter page numbers (comma-separated)", value="")
|
| 113 |
-
search_strings = st.text_input("Enter search strings (comma-separated)")
|
| 114 |
-
rect_coords = st.text_input("Enter rectangle coordinates (x1,y1,x2,y2)")
|
| 115 |
-
if st.button("Parse PDF"):
|
| 116 |
-
search_strings = [s.strip() for s in search_strings.split(',')]
|
| 117 |
-
output = parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords)
|
| 118 |
-
st.write(output)
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
| 121 |
main()
|
|
|
|
| 39 |
# Parse PDF
|
| 40 |
def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
|
| 41 |
doc = fitz.open(os.path.join(PDF_STORAGE_DIR, pdf_file_name))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
pages = [doc.load_page(i) for i in page_numbers]
|
| 43 |
output = ""
|
| 44 |
|
|
|
|
| 47 |
for page in pages:
|
| 48 |
output += "### Page %d\n" % page.number
|
| 49 |
output += page.get_text() + "\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
if search_strings:
|
| 52 |
+
output += "\n### Search Results:\n"
|
| 53 |
+
for search_string in search_strings:
|
| 54 |
+
for page in pages:
|
| 55 |
+
rect = page.search_for(search_string)
|
| 56 |
+
if rect:
|
| 57 |
+
output += f"{search_string} - {rect}\n"
|
| 58 |
+
else:
|
| 59 |
+
output += f"{search_string} not found\n"
|
| 60 |
+
output += "\n"
|
| 61 |
+
|
| 62 |
if rect_coords:
|
| 63 |
x1, y1, x2, y2 = map(float, rect_coords.split(','))
|
| 64 |
rect = fitz.Rect(x1, y1, x2, y2)
|
|
|
|
| 67 |
output += page.get_text("text", clip=rect) + "\n\n"
|
| 68 |
return output
|
| 69 |
|
|
|
|
| 70 |
def main():
    """Render the PDF Manager Streamlit app.

    A sidebar select box switches between three views:

    * "Home" - a short welcome message.
    * "Extract Text" - a form collecting a stored PDF, page numbers,
      search strings and an optional clip rectangle; on submit the output
      of ``parse_pdf`` is written to the page.
    * "Manage PDFs" - download a PDF from a URL, delete a stored PDF, or
      upload a new one.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm indentation against the real file before merging.
    st.title("PDF Manager")

    # Simplified navigation
    tabs = ["Home", "Extract Text", "Manage PDFs"]
    tab = st.sidebar.selectbox("Navigation", tabs)

    if tab == "Home":
        st.write("### Welcome to PDF Manager!")
        st.write("This app helps you extract text from PDF files.")

    elif tab == "Extract Text":
        st.write("### Extract Text from PDF")

        with st.form("extract_text_form"):
            pdf_files = os.listdir(PDF_STORAGE_DIR)

            # Label fixed: this widget picks the file to parse, not to delete.
            pdf_file = st.selectbox("Select PDF file to parse", pdf_files)
            page_numbers = st.text_input("Enter page numbers (comma-separated)")
            search_strings = st.text_input("Enter search strings (comma-separated)")
            rect_coords = st.text_input("Enter rectangle coordinates (x1,y1,x2,y2)")
            submit_button = st.form_submit_button("Extract Text")
            if submit_button:
                # Blank tokens (e.g. a trailing comma) are skipped; anything
                # that is not an integer is reported instead of crashing.
                try:
                    pages = [
                        int(token)
                        for token in page_numbers.split(",")
                        if token.strip()
                    ]
                except ValueError:
                    pages = None

                if pdf_file is None:
                    # The select box yields None when no PDFs are stored.
                    st.error("No PDF file available to parse")
                elif pages is None:
                    st.error("Page numbers must be comma-separated integers")
                else:
                    # Empty search terms are dropped before searching.
                    terms = [
                        term.strip()
                        for term in search_strings.split(",")
                        if term.strip()
                    ]
                    # No page numbers given -> default to the first page.
                    output = parse_pdf(pdf_file, pages or [0], terms, rect_coords)
                    st.write(output)

    elif tab == "Manage PDFs":
        st.write("### Manage PDF Files")

        with st.container():
            col1, col2 = st.columns(2)

            # Download PDF
            with col1:
                url = st.text_input("Enter PDF URL")
                if st.button("Download PDF"):
                    if not url:
                        st.error("Please enter a URL")
                    else:
                        pdf_file_name = download_pdf(url)
                        st.success(f"PDF downloaded: {pdf_file_name}")
            # Delete PDF
            with col2:
                pdf_files = os.listdir(PDF_STORAGE_DIR)
                pdf_file_name = st.selectbox("Select PDF file to delete", pdf_files)
                if st.button("Delete PDF"):
                    output = delete_pdf(pdf_file_name)
                    st.success(output)

        with st.container():
            st.write("# Upload PDF")
            # Upload PDF
            pdf_file = st.file_uploader("Select PDF file")
            if pdf_file:
                pdf_file_name = upload_pdf(pdf_file)
                st.success(f"PDF uploaded: {pdf_file_name}")
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
if __name__ == "__main__":
|
| 139 |
main()
|