venkatl committed on
Commit
aa3e00e
·
1 Parent(s): e685745
Files changed (1) hide show
  1. app.py +73 -55
app.py CHANGED
@@ -39,10 +39,6 @@ def delete_pdf(pdf_file_name):
39
  # Parse PDF
40
  def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
41
  doc = fitz.open(os.path.join(PDF_STORAGE_DIR, pdf_file_name))
42
- if not page_numbers:
43
- page_numbers = [0]
44
- else:
45
- page_numbers = [int(i) for i in page_numbers.split(',')]
46
  pages = [doc.load_page(i) for i in page_numbers]
47
  output = ""
48
 
@@ -51,20 +47,18 @@ def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
51
  for page in pages:
52
  output += "### Page %d\n" % page.number
53
  output += page.get_text() + "\n\n"
54
-
55
- # Search strings
56
- output += "\n### Search Results:\n"
57
- for search_string in search_strings:
58
- for page in pages:
59
- rect = page.search_for(search_string)
60
- if rect:
61
- output += f"{search_string} - {rect}\n"
62
- else:
63
- output += f"{search_string} not found\n"
64
 
65
- output += "\n"
66
-
67
- # Rectangle coordinates
 
 
 
 
 
 
 
 
68
  if rect_coords:
69
  x1, y1, x2, y2 = map(float, rect_coords.split(','))
70
  rect = fitz.Rect(x1, y1, x2, y2)
@@ -73,49 +67,73 @@ def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
73
  output += page.get_text("text", clip=rect) + "\n\n"
74
  return output
75
 
76
- # Main app
77
  def main():
78
- st.write("### PDF Manager")
 
 
 
 
79
 
80
- # Container for download/upload/delete
81
- with st.container():
82
- st.write("#### Manage PDF Files")
83
- col1, col2 = st.columns(2)
84
 
85
- # Download PDF
86
- with col1:
87
- url = st.text_input("Enter PDF URL")
88
- if st.button("Download PDF"):
89
- pdf_file_name = download_pdf(url)
90
- st.success(f"PDF downloaded: {pdf_file_name}")
91
- # Delete PDF
92
- with col2:
93
  pdf_files = os.listdir(PDF_STORAGE_DIR)
94
- pdf_file_name = st.selectbox("Select PDF file to delete", pdf_files)
95
- if st.button("Delete PDF"):
96
- output = delete_pdf(pdf_file_name)
97
- st.success(output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- with st.container():
100
- st.write("# Upload PDF")
101
- # Upload PDF
102
- pdf_file = st.file_uploader("Select PDF file")
103
- if pdf_file:
104
- pdf_file_name = upload_pdf(pdf_file)
105
- st.success(f"PDF uploaded: {pdf_file_name}")
106
-
107
- # Container for parsing
108
- with st.container():
109
- st.write("#### Parse PDF")
110
- pdf_files = os.listdir(PDF_STORAGE_DIR)
111
- pdf_file_name = st.selectbox("Select PDF file to parse", pdf_files)
112
- page_numbers = st.text_input("Enter page numbers (comma-separated)", value="")
113
- search_strings = st.text_input("Enter search strings (comma-separated)")
114
- rect_coords = st.text_input("Enter rectangle coordinates (x1,y1,x2,y2)")
115
- if st.button("Parse PDF"):
116
- search_strings = [s.strip() for s in search_strings.split(',')]
117
- output = parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords)
118
- st.write(output)
119
 
120
  if __name__ == "__main__":
121
  main()
 
39
  # Parse PDF
40
  def parse_pdf(pdf_file_name, page_numbers, search_strings, rect_coords):
41
  doc = fitz.open(os.path.join(PDF_STORAGE_DIR, pdf_file_name))
 
 
 
 
42
  pages = [doc.load_page(i) for i in page_numbers]
43
  output = ""
44
 
 
47
  for page in pages:
48
  output += "### Page %d\n" % page.number
49
  output += page.get_text() + "\n\n"
 
 
 
 
 
 
 
 
 
 
50
 
51
+ if search_strings:
52
+ output += "\n### Search Results:\n"
53
+ for search_string in search_strings:
54
+ for page in pages:
55
+ rect = page.search_for(search_string)
56
+ if rect:
57
+ output += f"{search_string} - {rect}\n"
58
+ else:
59
+ output += f"{search_string} not found\n"
60
+ output += "\n"
61
+
62
  if rect_coords:
63
  x1, y1, x2, y2 = map(float, rect_coords.split(','))
64
  rect = fitz.Rect(x1, y1, x2, y2)
 
67
  output += page.get_text("text", clip=rect) + "\n\n"
68
  return output
69
 
 
70
  def main():
71
+ st.title("PDF Manager")
72
+
73
+ # Simplified navigation
74
+ tabs = ["Home", "Extract Text", "Manage PDFs"]
75
+ tab = st.sidebar.selectbox("Navigation", tabs)
76
 
77
+ if tab == "Home":
78
+ st.write("### Welcome to PDF Manager!")
79
+ st.write("This app helps you extract text from PDF files.")
 
80
 
81
+ elif tab == "Extract Text":
82
+ st.write("### Extract Text from PDF")
83
+
84
+ with st.form("extract_text_form"):
 
 
 
 
85
  pdf_files = os.listdir(PDF_STORAGE_DIR)
86
+
87
+ pdf_file = st.selectbox("Select PDF file to delete", pdf_files)
88
+ page_numbers = st.text_input("Enter page numbers (comma-separated)")
89
+ search_strings = st.text_input("Enter search strings (comma-separated)")
90
+ rect_coords = st.text_input("Enter rectangle coordinates (x1,y1,x2,y2)")
91
+ submit_button = st.form_submit_button("Extract Text")
92
+ if submit_button:
93
+ if not page_numbers:
94
+ page_numbers = [0]
95
+ else:
96
+ page_numbers = [int(i) for i in page_numbers.split(',')]
97
+
98
+ if not search_strings:
99
+ search_strings = []
100
+ else:
101
+ search_strings = [s.strip() for s in search_strings.split(',')]
102
+
103
+ output = parse_pdf(pdf_file, page_numbers, search_strings, rect_coords)
104
+ st.write(output)
105
+
106
+ elif tab == "Manage PDFs":
107
+ st.write("### Manage PDF Files")
108
+
109
+ with st.container():
110
+ col1, col2 = st.columns(2)
111
+
112
+ # Download PDF
113
+ with col1:
114
+ url = st.text_input("Enter PDF URL")
115
+ if st.button("Download PDF"):
116
+ if not url:
117
+ st.error("Please enter a URL")
118
+ else:
119
+ pdf_file_name = download_pdf(url)
120
+ st.success(f"PDF downloaded: {pdf_file_name}")
121
+ # Delete PDF
122
+ with col2:
123
+ pdf_files = os.listdir(PDF_STORAGE_DIR)
124
+ pdf_file_name = st.selectbox("Select PDF file to delete", pdf_files)
125
+ if st.button("Delete PDF"):
126
+ output = delete_pdf(pdf_file_name)
127
+ st.success(output)
128
+
129
+ with st.container():
130
+ st.write("# Upload PDF")
131
+ # Upload PDF
132
+ pdf_file = st.file_uploader("Select PDF file")
133
+ if pdf_file:
134
+ pdf_file_name = upload_pdf(pdf_file)
135
+ st.success(f"PDF uploaded: {pdf_file_name}")
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  if __name__ == "__main__":
139
  main()