acecalisto3 commited on
Commit
e9bc41c
·
verified ·
1 Parent(s): 86e8cee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -85
app.py CHANGED
@@ -3,11 +3,25 @@ import requests
3
  import os
4
  import urllib
5
  import base64
6
- import bs4
7
  import hashlib
8
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md', '.gitattributes', "backup.py", "Dockerfile"]
11
  URLS = {
12
  "Chordify - Play Along Chords": "https://chordify.net/",
13
  "National Guitar Academy - Guitar Learning": "https://www.guitaracademy.com/",
@@ -22,12 +36,24 @@ URLS = {
22
  "John Lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=John%20Lennon",
23
  }
24
 
25
- if not os.path.exists("history.json"):
26
- with open("history.json", "w") as f:
27
- json.dump({}, f)
 
 
28
 
29
- def download_file(url, local_filename):
30
- if url.startswith('http://') or url.startswith('https://'):
 
 
 
 
 
 
 
 
 
 
31
  try:
32
  with requests.get(url, stream=True) as r:
33
  r.raise_for_status()
@@ -36,82 +62,185 @@ def download_file(url, local_filename):
36
  f.write(chunk)
37
  return local_filename
38
  except requests.exceptions.HTTPError as err:
39
- print(f"HTTP error occurred: {err}")
40
-
41
- def download_html_and_files(url, subdir):
42
- html_content = requests.get(url).text
43
- soup = BeautifulSoup(html_content, 'html.parser', 'lxml-xml')
44
- base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
45
 
46
- for link in soup.find_all('a'):
47
- file_url = urllib.parse.urljoin(base_url, link.get('href'))
48
- local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
49
-
50
- if not local_filename.endswith('/') and local_filename != subdir:
51
- link['href'] = local_filename
52
- download_file(file_url, local_filename)
53
 
54
- with open(os.path.join(subdir, "index.html"), "w") as file:
55
- file.write(str(soup))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- def list_files(directory_path='.'):
58
- files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
 
 
59
  return [f for f in files if f not in EXCLUDED_FILES]
60
 
61
- def file_editor(file_path):
 
62
  st.write(f"Editing File: {os.path.basename(file_path)}")
63
- file_content = ""
64
-
65
- with open(file_path, "r") as f:
66
- file_content = f.read()
 
 
 
 
67
 
68
- file_content = st.text_area("Edit the file content:", value=file_content, height=250)
 
 
 
 
69
 
70
  if st.button("💾 Save"):
71
- with open(file_path, "w") as f:
72
- f.write(file_content)
73
- st.success(f"File '{os.path.basename(file_path)}' saved!")
 
 
 
 
74
 
75
- def show_file_operations(file_path, sequence_number):
 
76
  unique_key = hashlib.md5(file_path.encode()).hexdigest()
77
  file_content = ""
78
 
79
  col01, col02, col1, col2, col3 = st.columns(5)
 
80
  with col01:
81
  st.write(os.path.basename(file_path))
 
82
  with col1:
83
  edit_key = f"edit_{unique_key}_{sequence_number}"
84
- if st.button(f"✏️ Edit", key=edit_key):
85
- with open(file_path, "r") as f:
86
- file_content = f.read()
87
- text_area_key = f"text_area_{unique_key}_{sequence_number}"
88
- file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
 
 
 
 
 
 
 
 
 
89
 
90
  with col2:
91
  save_key = f"save_{unique_key}_{sequence_number}"
92
- if st.button(f"💾 Save", key=save_key ):
93
- if file_content: # Ensure file_content is not empty
94
- with open(file_path, "w") as f:
95
- f.write(file_content)
96
- st.success(f"File saved!")
 
 
 
 
97
 
98
  with col3:
99
  delete_key = f"delete_{unique_key}_{sequence_number}"
100
- if st.button(f"🗑️ Delete", key=delete_key):
101
- os.remove(file_path)
102
- st.markdown(f"File deleted!")
 
 
 
 
103
 
104
- file_sequence_numbers = {}
 
 
 
 
 
 
 
 
 
 
105
 
106
- def show_download_links(subdir):
 
107
  global file_sequence_numbers
 
 
 
 
108
  for file in list_files(subdir):
109
  file_path = os.path.join(subdir, file)
110
- if file_path not in file_sequence_numbers:
111
- file_sequence_numbers[file_path] = 1
 
112
  else:
113
- file_sequence_numbers[file_path] += 1
114
- sequence_number = file_sequence_numbers[file_path]
 
115
 
116
  if os.path.isfile(file_path):
117
  st.markdown(get_download_link(file_path), unsafe_allow_html=True)
@@ -119,66 +248,79 @@ def show_download_links(subdir):
119
  else:
120
  st.write(f"File not found: {file}")
121
 
122
- def get_download_link(file):
123
- with open(file, "rb") as f:
124
- bytes = f.read()
125
- b64 = base64.b64encode(bytes).decode()
126
- href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
127
- return href
128
-
129
- def main():
130
  st.sidebar.title('Web Datasets Bulk Downloader')
131
 
 
 
 
 
132
  query_params = st.experimental_get_query_params()
133
  file_to_edit = query_params.get('file_to_edit', [None])[0]
134
 
135
  if file_to_edit and os.path.exists(file_to_edit):
136
  file_editor(file_to_edit)
137
  else:
138
- url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"])
 
 
 
 
 
139
  url = ""
140
  if url_input_method == "Enter URL":
141
- url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
 
 
142
  else:
143
- selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()))
 
 
 
144
  url = URLS[selected_site]
145
 
146
- if not os.path.exists("history.json"):
147
- with open("history.json", "w") as f:
148
- json.dump({}, f)
149
-
150
- with open("history.json", "r") as f:
151
- try:
152
  history = json.load(f)
153
- print("History loaded:", history) # Debugging line
154
- except Exception as e:
155
- print('Error loading history:', e)
156
 
 
157
  if url:
158
  subdir = hashlib.md5(url.encode()).hexdigest()
159
- if not os.path.exists(subdir):
160
- os.makedirs(subdir)
161
  if url not in history:
162
  history[url] = subdir
163
- with open("history.json", "w") as f:
164
- json.dump(history, f)
 
 
 
165
 
 
166
  if st.sidebar.button('📥 Get All the Content'):
167
- download_html_and_files(url, history[url])
168
- show_download_links(history[url])
 
 
 
169
 
 
170
  if st.sidebar.button('📂 Show Download Links'):
171
  for subdir in history.values():
172
  show_download_links(subdir)
173
 
 
174
  with st.expander("URL History and Downloaded Files"):
175
- try:
176
- for url, subdir in history.items():
177
- st.markdown(f"#### {url}")
178
- show_download_links(subdir)
179
- except Exception as e:
180
- print('Error displaying history:', e)
181
 
 
182
  for subdir in history.values():
183
  show_download_links(subdir)
184
 
 
3
import base64
import hashlib
import json
import logging
import os
import urllib
import urllib.parse  # explicit: `import urllib` alone does not bind the parse submodule
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional

from bs4 import BeautifulSoup
13
+
14
# Module-level logging: one logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# File names that must never be listed, edited, or offered for download
# (the app's own sources and packaging files).
EXCLUDED_FILES = [
    'app.py', 'requirements.txt', 'pre-requirements.txt',
    'packages.txt', 'README.md', '.gitattributes',
    "backup.py", "Dockerfile",
]
24
 
 
25
  URLS = {
26
  "Chordify - Play Along Chords": "https://chordify.net/",
27
  "National Guitar Academy - Guitar Learning": "https://www.guitaracademy.com/",
 
36
  "John Lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=John%20Lennon",
37
  }
38
 
39
def initialize_history() -> None:
    """Create an empty history.json on first run so later reads never fail."""
    if os.path.exists("history.json"):
        return
    with open("history.json", "w") as fp:
        json.dump({}, fp)
44
 
45
def download_file(url: str, local_filename: str) -> Optional[str]:
    """
    Download a file from a URL to a local path, streaming in chunks.

    Args:
        url (str): The URL to download from; must be http(s).
        local_filename (str): The local file path to save to.

    Returns:
        Optional[str]: The local filename on success, None otherwise.
    """
    # Refuse anything that is not plain http(s) (e.g. mailto:, ftp:, javascript:).
    if not url.startswith(('http://', 'https://')):
        return None
    try:
        # timeout added for consistency with download_html_and_files, so a
        # stalled server cannot hang the app indefinitely.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            # NOTE(review): the chunk-writing body was elided in the diff
            # context; reconstructed from the standard streaming idiom —
            # confirm against the full file.
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return local_filename
    except requests.exceptions.RequestException as err:
        # Broadened from HTTPError: connection errors and timeouts previously
        # escaped this helper and crashed the caller.
        logger.error(f"Download failed for {url}: {err}")
        return None
 
 
 
68
 
69
def download_html_and_files(url: str, subdir: str) -> None:
    """
    Fetch a page, download every linked asset into *subdir*, and save a
    rewritten index.html whose anchors point at the local copies.

    Args:
        url (str): The URL to download content from.
        subdir (str): The subdirectory to save files to.
    """
    try:
        os.makedirs(subdir, exist_ok=True)

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        try:
            dom = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.error(f"Failed to parse HTML content: {e}")
            st.error(f"Failed to parse HTML content from {url}")
            return

        # Strip path/params/query/fragment so relative links resolve
        # against the site root.
        parts = urllib.parse.urlparse(url)
        base_url = urllib.parse.urlunparse(
            parts._replace(path='', params='', query='', fragment='')
        )

        for anchor in dom.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            try:
                asset_url = urllib.parse.urljoin(base_url, href)
                leaf = urllib.parse.urlparse(asset_url).path.split('/')[-1]
                target_path = os.path.join(subdir, leaf)

                if not target_path or target_path.endswith('/'):
                    continue
                if target_path != subdir:
                    # Rewrite the anchor before downloading so index.html
                    # references the local copy.
                    anchor['href'] = target_path
                    download_file(asset_url, target_path)
            except Exception as e:
                logger.error(f"Failed to process link {href}: {e}")
                continue

        try:
            index_path = os.path.join(subdir, "index.html")
            with open(index_path, "w", encoding='utf-8') as file:
                file.write(str(dom))
        except Exception as e:
            logger.error(f"Failed to save HTML file: {e}")
            st.error("Failed to save downloaded content")

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to download content from {url}: {e}")
        st.error(f"Failed to download content from {url}")
    except Exception as e:
        logger.error(f"Unexpected error while downloading content: {e}")
        st.error("An unexpected error occurred while downloading content")
133
 
134
def list_files(directory_path: str = '.',
               excluded: Optional[List[str]] = None) -> List[str]:
    """
    List regular files in *directory_path*, skipping excluded names.

    Args:
        directory_path (str): Directory to scan (default: current directory).
        excluded (Optional[List[str]]): File names to omit; defaults to the
            module-level EXCLUDED_FILES list, preserving the old behavior.

    Returns:
        List[str]: File names (not full paths) that exist and are not excluded.
    """
    if excluded is None:
        excluded = EXCLUDED_FILES  # backward-compatible default
    files = [f for f in os.listdir(directory_path)
             if os.path.isfile(os.path.join(directory_path, f))]
    return [f for f in files if f not in excluded]
139
 
140
def file_editor(file_path: str) -> None:
    """Render a Streamlit editor for *file_path* with a save button."""
    st.write(f"Editing File: {os.path.basename(file_path)}")

    try:
        with open(file_path, "r", encoding='utf-8') as fp:
            original_text = fp.read()
    except Exception as e:
        logger.error(f"Failed to read file {file_path}: {e}")
        st.error("Failed to read file")
        return

    new_text = st.text_area("Edit the file content:",
                            value=original_text,
                            height=250)

    if st.button("💾 Save"):
        try:
            with open(file_path, "w", encoding='utf-8') as fp:
                fp.write(new_text)
            st.success(f"File '{os.path.basename(file_path)}' saved!")
        except Exception as e:
            logger.error(f"Failed to save file {file_path}: {e}")
            st.error("Failed to save file")
166
 
167
def show_file_operations(file_path: str, sequence_number: int) -> None:
    """Render per-file edit / save / delete controls on a five-column row.

    Widget keys combine an MD5 of the path with *sequence_number* so the
    same file rendered more than once still gets unique Streamlit keys.
    """
    key_base = hashlib.md5(file_path.encode()).hexdigest()
    buffer = ""

    name_col, _spacer, edit_col, save_col, delete_col = st.columns(5)

    with name_col:
        st.write(os.path.basename(file_path))

    with edit_col:
        edit_key = f"edit_{key_base}_{sequence_number}"
        if st.button("✏️ Edit", key=edit_key):
            try:
                with open(file_path, "r", encoding='utf-8') as fp:
                    buffer = fp.read()
                text_area_key = f"text_area_{key_base}_{sequence_number}"
                buffer = st.text_area("Edit the file content:",
                                      value=buffer,
                                      height=250,
                                      key=text_area_key)
            except Exception as e:
                logger.error(f"Failed to read file {file_path}: {e}")
                st.error("Failed to read file")

    with save_col:
        save_key = f"save_{key_base}_{sequence_number}"
        if st.button("💾 Save", key=save_key):
            # Only write when the edit buffer actually holds content.
            if buffer:
                try:
                    with open(file_path, "w", encoding='utf-8') as fp:
                        fp.write(buffer)
                    st.success("File saved!")
                except Exception as e:
                    logger.error(f"Failed to save file {file_path}: {e}")
                    st.error("Failed to save file")

    with delete_col:
        delete_key = f"delete_{key_base}_{sequence_number}"
        if st.button("🗑️ Delete", key=delete_key):
            try:
                os.remove(file_path)
                st.success("File deleted!")
            except Exception as e:
                logger.error(f"Failed to delete file {file_path}: {e}")
                st.error("Failed to delete file")
215
 
216
def get_download_link(file: str) -> str:
    """
    Build an HTML anchor embedding *file* as a base64 data-URI download.

    Args:
        file (str): Path of the file to expose for download.

    Returns:
        str: An <a> tag with the file content inlined, or a plain error
            message string if the file could not be read.
    """
    try:
        with open(file, "rb") as f:
            payload = f.read()
        b64 = base64.b64encode(payload).decode()
        filename = os.path.basename(file)
        # Bug fix: `filename` was computed but unused — a literal placeholder
        # appeared in both the download attribute and the visible label.
        return (
            f'<a href="data:file/octet-stream;base64,{b64}" '
            f"download='{filename}'>Download: {filename}</a>"
        )
    except Exception as e:
        logger.error(f"Failed to create download link for {file}: {e}")
        return f"Failed to create download link for {os.path.basename(file)}"
227
 
228
def show_download_links(subdir: str) -> None:
    """
    Render a download link (and file-operation controls) for every file
    in *subdir*.

    A counter stored on the function object assigns each file path a
    sequence number so repeated renders of the same file receive distinct
    Streamlit widget keys.
    """
    # Fix: the old `global file_sequence_numbers` declaration was vestigial —
    # the state actually lives on the function object, so the global is gone.
    if not hasattr(show_download_links, 'file_sequence_numbers'):
        show_download_links.file_sequence_numbers = {}
    counters = show_download_links.file_sequence_numbers

    for file in list_files(subdir):
        file_path = os.path.join(subdir, file)
        counters[file_path] = counters.get(file_path, 0) + 1
        sequence_number = counters[file_path]

        if os.path.isfile(file_path):
            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
            # NOTE(review): this call reconstructs a line elided in the diff
            # context; the sequence number exists solely to feed it — confirm
            # against the full file.
            show_file_operations(file_path, sequence_number)
        else:
            st.write(f"File not found: {file}")
250
 
251
def main() -> None:
    """Main application function: sidebar-driven bulk downloader UI."""
    st.sidebar.title('Web Datasets Bulk Downloader')

    # Make sure history.json exists before anything reads it.
    initialize_history()

    # A ?file_to_edit=... query parameter switches the app into editor mode.
    params = st.experimental_get_query_params()
    target_file = params.get('file_to_edit', [None])[0]

    if target_file and os.path.exists(target_file):
        file_editor(target_file)
        return

    # --- downloader mode -------------------------------------------------
    url_input_method = st.sidebar.radio(
        "Choose URL Input Method",
        ["Enter URL", "Select from List"],
    )

    url = ""
    if url_input_method == "Enter URL":
        url = st.sidebar.text_input(
            'Please enter a Web URL to bulk download text and files'
        )
    else:
        selected_site = st.sidebar.selectbox(
            "Select a Website",
            list(URLS.keys()),
        )
        url = URLS[selected_site]

    # Load the URL -> subdir history, falling back to empty on any failure.
    try:
        with open("history.json", "r") as f:
            history = json.load(f)
    except Exception as e:
        logger.error(f"Failed to load history: {e}")
        history = {}

    # Register the URL: subdir name is the MD5 of the URL.
    if url:
        subdir = hashlib.md5(url.encode()).hexdigest()
        os.makedirs(subdir, exist_ok=True)

        if url not in history:
            history[url] = subdir
            try:
                with open("history.json", "w") as f:
                    json.dump(history, f)
            except Exception as e:
                logger.error(f"Failed to save history: {e}")

    if st.sidebar.button('📥 Get All the Content'):
        if url:
            download_html_and_files(url, history[url])
            show_download_links(history[url])
        else:
            st.warning("Please enter or select a URL first")

    if st.sidebar.button('📂 Show Download Links'):
        for subdir in history.values():
            show_download_links(subdir)

    with st.expander("URL History and Downloaded Files"):
        for url, subdir in history.items():
            st.markdown(f"#### {url}")
            show_download_links(subdir)

    # Re-render current files for every known subdir.
    for subdir in history.values():
        show_download_links(subdir)
326