acecalisto3 commited on
Commit
e9bc41c
·
verified ·
1 Parent(s): 86e8cee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -85
app.py CHANGED
@@ -3,11 +3,25 @@ import requests
3
  import os
4
  import urllib
5
  import base64
6
- import bs4
7
  import hashlib
8
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md', '.gitattributes', "backup.py", "Dockerfile"]
11
  URLS = {
12
  "Chordify - Play Along Chords": "https://chordify.net/",
13
  "National Guitar Academy - Guitar Learning": "https://www.guitaracademy.com/",
@@ -22,12 +36,24 @@ URLS = {
22
  "John Lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=John%20Lennon",
23
  }
24
 
25
- if not os.path.exists("history.json"):
26
- with open("history.json", "w") as f:
27
- json.dump({}, f)
 
 
28
 
29
- def download_file(url, local_filename):
30
- if url.startswith('http://') or url.startswith('https://'):
 
 
 
 
 
 
 
 
 
 
31
  try:
32
  with requests.get(url, stream=True) as r:
33
  r.raise_for_status()
@@ -36,82 +62,185 @@ def download_file(url, local_filename):
36
  f.write(chunk)
37
  return local_filename
38
  except requests.exceptions.HTTPError as err:
39
- print(f"HTTP error occurred: {err}")
40
-
41
- def download_html_and_files(url, subdir):
42
- html_content = requests.get(url).text
43
- soup = BeautifulSoup(html_content, 'html.parser', 'lxml-xml')
44
- base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
45
 
46
- for link in soup.find_all('a'):
47
- file_url = urllib.parse.urljoin(base_url, link.get('href'))
48
- local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
49
-
50
- if not local_filename.endswith('/') and local_filename != subdir:
51
- link['href'] = local_filename
52
- download_file(file_url, local_filename)
53
 
54
- with open(os.path.join(subdir, "index.html"), "w") as file:
55
- file.write(str(soup))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- def list_files(directory_path='.'):
58
- files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
 
 
59
  return [f for f in files if f not in EXCLUDED_FILES]
60
 
61
- def file_editor(file_path):
 
62
  st.write(f"Editing File: {os.path.basename(file_path)}")
63
- file_content = ""
64
-
65
- with open(file_path, "r") as f:
66
- file_content = f.read()
 
 
 
 
67
 
68
- file_content = st.text_area("Edit the file content:", value=file_content, height=250)
 
 
 
 
69
 
70
  if st.button("💾 Save"):
71
- with open(file_path, "w") as f:
72
- f.write(file_content)
73
- st.success(f"File '{os.path.basename(file_path)}' saved!")
 
 
 
 
74
 
75
- def show_file_operations(file_path, sequence_number):
 
76
  unique_key = hashlib.md5(file_path.encode()).hexdigest()
77
  file_content = ""
78
 
79
  col01, col02, col1, col2, col3 = st.columns(5)
 
80
  with col01:
81
  st.write(os.path.basename(file_path))
 
82
  with col1:
83
  edit_key = f"edit_{unique_key}_{sequence_number}"
84
- if st.button(f"✏️ Edit", key=edit_key):
85
- with open(file_path, "r") as f:
86
- file_content = f.read()
87
- text_area_key = f"text_area_{unique_key}_{sequence_number}"
88
- file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
 
 
 
 
 
 
 
 
 
89
 
90
  with col2:
91
  save_key = f"save_{unique_key}_{sequence_number}"
92
- if st.button(f"💾 Save", key=save_key ):
93
- if file_content: # Ensure file_content is not empty
94
- with open(file_path, "w") as f:
95
- f.write(file_content)
96
- st.success(f"File saved!")
 
 
 
 
97
 
98
  with col3:
99
  delete_key = f"delete_{unique_key}_{sequence_number}"
100
- if st.button(f"🗑️ Delete", key=delete_key):
101
- os.remove(file_path)
102
- st.markdown(f"File deleted!")
 
 
 
 
103
 
104
- file_sequence_numbers = {}
 
 
 
 
 
 
 
 
 
 
105
 
106
- def show_download_links(subdir):
 
107
  global file_sequence_numbers
 
 
 
 
108
  for file in list_files(subdir):
109
  file_path = os.path.join(subdir, file)
110
- if file_path not in file_sequence_numbers:
111
- file_sequence_numbers[file_path] = 1
 
112
  else:
113
- file_sequence_numbers[file_path] += 1
114
- sequence_number = file_sequence_numbers[file_path]
 
115
 
116
  if os.path.isfile(file_path):
117
  st.markdown(get_download_link(file_path), unsafe_allow_html=True)
@@ -119,66 +248,79 @@ def show_download_links(subdir):
119
  else:
120
  st.write(f"File not found: {file}")
121
 
122
- def get_download_link(file):
123
- with open(file, "rb") as f:
124
- bytes = f.read()
125
- b64 = base64.b64encode(bytes).decode()
126
- href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
127
- return href
128
-
129
- def main():
130
  st.sidebar.title('Web Datasets Bulk Downloader')
131
 
 
 
 
 
132
  query_params = st.experimental_get_query_params()
133
  file_to_edit = query_params.get('file_to_edit', [None])[0]
134
 
135
  if file_to_edit and os.path.exists(file_to_edit):
136
  file_editor(file_to_edit)
137
  else:
138
- url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"])
 
 
 
 
 
139
  url = ""
140
  if url_input_method == "Enter URL":
141
- url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
 
 
142
  else:
143
- selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()))
 
 
 
144
  url = URLS[selected_site]
145
 
146
- if not os.path.exists("history.json"):
147
- with open("history.json", "w") as f:
148
- json.dump({}, f)
149
-
150
- with open("history.json", "r") as f:
151
- try:
152
  history = json.load(f)
153
- print("History loaded:", history) # Debugging line
154
- except Exception as e:
155
- print('Error loading history:', e)
156
 
 
157
  if url:
158
  subdir = hashlib.md5(url.encode()).hexdigest()
159
- if not os.path.exists(subdir):
160
- os.makedirs(subdir)
161
  if url not in history:
162
  history[url] = subdir
163
- with open("history.json", "w") as f:
164
- json.dump(history, f)
 
 
 
165
 
 
166
  if st.sidebar.button('📥 Get All the Content'):
167
- download_html_and_files(url, history[url])
168
- show_download_links(history[url])
 
 
 
169
 
 
170
  if st.sidebar.button('📂 Show Download Links'):
171
  for subdir in history.values():
172
  show_download_links(subdir)
173
 
 
174
  with st.expander("URL History and Downloaded Files"):
175
- try:
176
- for url, subdir in history.items():
177
- st.markdown(f"#### {url}")
178
- show_download_links(subdir)
179
- except Exception as e:
180
- print('Error displaying history:', e)
181
 
 
182
  for subdir in history.values():
183
  show_download_links(subdir)
184
 
 
3
import base64
import hashlib
import json
import logging
import os
import urllib
import urllib.parse  # explicit: `import urllib` alone does not bind the parse submodule
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional

from bs4 import BeautifulSoup
13
+
14
# Module-level logging: one logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# File names that must never be listed, edited, or offered for download
# (the app's own sources and packaging files).
EXCLUDED_FILES = [
    'app.py', 'requirements.txt', 'pre-requirements.txt',
    'packages.txt', 'README.md', '.gitattributes',
    "backup.py", "Dockerfile",
]
24
 
 
25
  URLS = {
26
  "Chordify - Play Along Chords": "https://chordify.net/",
27
  "National Guitar Academy - Guitar Learning": "https://www.guitaracademy.com/",
 
36
  "John Lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=John%20Lennon",
37
  }
38
 
39
def initialize_history() -> None:
    """Create an empty history.json on first run so later reads never fail."""
    if os.path.exists("history.json"):
        return
    with open("history.json", "w") as fp:
        json.dump({}, fp)
44
 
45
def download_file(url: str, local_filename: str) -> Optional[str]:
    """
    Download a file from a URL to a local path, streaming in chunks.

    Args:
        url (str): The URL to download from; must be http(s).
        local_filename (str): The local file path to save to.

    Returns:
        Optional[str]: The local filename on success, None otherwise.
    """
    # Refuse anything that is not plain http(s) (e.g. mailto:, ftp:, javascript:).
    if not url.startswith(('http://', 'https://')):
        return None
    try:
        # timeout added for consistency with download_html_and_files, so a
        # stalled server cannot hang the app indefinitely.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            # NOTE(review): the chunk-writing body was elided in the diff
            # context; reconstructed from the standard streaming idiom —
            # confirm against the full file.
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return local_filename
    except requests.exceptions.RequestException as err:
        # Broadened from HTTPError: connection errors and timeouts previously
        # escaped this helper and crashed the caller.
        logger.error(f"Download failed for {url}: {err}")
        return None
 
 
 
68
 
69
def download_html_and_files(url: str, subdir: str) -> None:
    """
    Fetch a page, download every linked asset into *subdir*, and save a
    rewritten index.html whose anchors point at the local copies.

    Args:
        url (str): The URL to download content from.
        subdir (str): The subdirectory to save files to.
    """
    try:
        os.makedirs(subdir, exist_ok=True)

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        try:
            dom = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logger.error(f"Failed to parse HTML content: {e}")
            st.error(f"Failed to parse HTML content from {url}")
            return

        # Strip path/params/query/fragment so relative links resolve
        # against the site root.
        parts = urllib.parse.urlparse(url)
        base_url = urllib.parse.urlunparse(
            parts._replace(path='', params='', query='', fragment='')
        )

        for anchor in dom.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            try:
                asset_url = urllib.parse.urljoin(base_url, href)
                leaf = urllib.parse.urlparse(asset_url).path.split('/')[-1]
                target_path = os.path.join(subdir, leaf)

                if not target_path or target_path.endswith('/'):
                    continue
                if target_path != subdir:
                    # Rewrite the anchor before downloading so index.html
                    # references the local copy.
                    anchor['href'] = target_path
                    download_file(asset_url, target_path)
            except Exception as e:
                logger.error(f"Failed to process link {href}: {e}")
                continue

        try:
            index_path = os.path.join(subdir, "index.html")
            with open(index_path, "w", encoding='utf-8') as file:
                file.write(str(dom))
        except Exception as e:
            logger.error(f"Failed to save HTML file: {e}")
            st.error("Failed to save downloaded content")

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to download content from {url}: {e}")
        st.error(f"Failed to download content from {url}")
    except Exception as e:
        logger.error(f"Unexpected error while downloading content: {e}")
        st.error("An unexpected error occurred while downloading content")
133
 
134
def list_files(directory_path: str = '.',
               excluded: Optional[List[str]] = None) -> List[str]:
    """
    List regular files in *directory_path*, skipping excluded names.

    Args:
        directory_path (str): Directory to scan (default: current directory).
        excluded (Optional[List[str]]): File names to omit; defaults to the
            module-level EXCLUDED_FILES list, preserving the old behavior.

    Returns:
        List[str]: File names (not full paths) that exist and are not excluded.
    """
    if excluded is None:
        excluded = EXCLUDED_FILES  # backward-compatible default
    files = [f for f in os.listdir(directory_path)
             if os.path.isfile(os.path.join(directory_path, f))]
    return [f for f in files if f not in excluded]
139
 
140
def file_editor(file_path: str) -> None:
    """Render a Streamlit editor for *file_path* with a save button."""
    st.write(f"Editing File: {os.path.basename(file_path)}")

    try:
        with open(file_path, "r", encoding='utf-8') as fp:
            original_text = fp.read()
    except Exception as e:
        logger.error(f"Failed to read file {file_path}: {e}")
        st.error("Failed to read file")
        return

    new_text = st.text_area("Edit the file content:",
                            value=original_text,
                            height=250)

    if st.button("💾 Save"):
        try:
            with open(file_path, "w", encoding='utf-8') as fp:
                fp.write(new_text)
            st.success(f"File '{os.path.basename(file_path)}' saved!")
        except Exception as e:
            logger.error(f"Failed to save file {file_path}: {e}")
            st.error("Failed to save file")
166
 
167
def show_file_operations(file_path: str, sequence_number: int) -> None:
    """Render per-file edit / save / delete controls on a five-column row.

    Widget keys combine an MD5 of the path with *sequence_number* so the
    same file rendered more than once still gets unique Streamlit keys.
    """
    key_base = hashlib.md5(file_path.encode()).hexdigest()
    buffer = ""

    name_col, _spacer, edit_col, save_col, delete_col = st.columns(5)

    with name_col:
        st.write(os.path.basename(file_path))

    with edit_col:
        edit_key = f"edit_{key_base}_{sequence_number}"
        if st.button("✏️ Edit", key=edit_key):
            try:
                with open(file_path, "r", encoding='utf-8') as fp:
                    buffer = fp.read()
                text_area_key = f"text_area_{key_base}_{sequence_number}"
                buffer = st.text_area("Edit the file content:",
                                      value=buffer,
                                      height=250,
                                      key=text_area_key)
            except Exception as e:
                logger.error(f"Failed to read file {file_path}: {e}")
                st.error("Failed to read file")

    with save_col:
        save_key = f"save_{key_base}_{sequence_number}"
        if st.button("💾 Save", key=save_key):
            # Only write when the edit buffer actually holds content.
            if buffer:
                try:
                    with open(file_path, "w", encoding='utf-8') as fp:
                        fp.write(buffer)
                    st.success("File saved!")
                except Exception as e:
                    logger.error(f"Failed to save file {file_path}: {e}")
                    st.error("Failed to save file")

    with delete_col:
        delete_key = f"delete_{key_base}_{sequence_number}"
        if st.button("🗑️ Delete", key=delete_key):
            try:
                os.remove(file_path)
                st.success("File deleted!")
            except Exception as e:
                logger.error(f"Failed to delete file {file_path}: {e}")
                st.error("Failed to delete file")
215
 
216
def get_download_link(file: str) -> str:
    """
    Build an HTML anchor embedding *file* as a base64 data-URI download.

    Args:
        file (str): Path of the file to expose for download.

    Returns:
        str: An <a> tag with the file content inlined, or a plain error
            message string if the file could not be read.
    """
    try:
        with open(file, "rb") as f:
            payload = f.read()
        b64 = base64.b64encode(payload).decode()
        filename = os.path.basename(file)
        # Bug fix: `filename` was computed but unused — a literal placeholder
        # appeared in both the download attribute and the visible label.
        return (
            f'<a href="data:file/octet-stream;base64,{b64}" '
            f"download='{filename}'>Download: {filename}</a>"
        )
    except Exception as e:
        logger.error(f"Failed to create download link for {file}: {e}")
        return f"Failed to create download link for {os.path.basename(file)}"
227
 
228
def show_download_links(subdir: str) -> None:
    """
    Render a download link (and file-operation controls) for every file
    in *subdir*.

    A counter stored on the function object assigns each file path a
    sequence number so repeated renders of the same file receive distinct
    Streamlit widget keys.
    """
    # Fix: the old `global file_sequence_numbers` declaration was vestigial —
    # the state actually lives on the function object, so the global is gone.
    if not hasattr(show_download_links, 'file_sequence_numbers'):
        show_download_links.file_sequence_numbers = {}
    counters = show_download_links.file_sequence_numbers

    for file in list_files(subdir):
        file_path = os.path.join(subdir, file)
        counters[file_path] = counters.get(file_path, 0) + 1
        sequence_number = counters[file_path]

        if os.path.isfile(file_path):
            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
            # NOTE(review): this call reconstructs a line elided in the diff
            # context; the sequence number exists solely to feed it — confirm
            # against the full file.
            show_file_operations(file_path, sequence_number)
        else:
            st.write(f"File not found: {file}")
250
 
251
def main() -> None:
    """Main application function: sidebar-driven bulk downloader UI."""
    st.sidebar.title('Web Datasets Bulk Downloader')

    # Make sure history.json exists before anything reads it.
    initialize_history()

    # A ?file_to_edit=... query parameter switches the app into editor mode.
    params = st.experimental_get_query_params()
    target_file = params.get('file_to_edit', [None])[0]

    if target_file and os.path.exists(target_file):
        file_editor(target_file)
        return

    # --- downloader mode -------------------------------------------------
    url_input_method = st.sidebar.radio(
        "Choose URL Input Method",
        ["Enter URL", "Select from List"],
    )

    url = ""
    if url_input_method == "Enter URL":
        url = st.sidebar.text_input(
            'Please enter a Web URL to bulk download text and files'
        )
    else:
        selected_site = st.sidebar.selectbox(
            "Select a Website",
            list(URLS.keys()),
        )
        url = URLS[selected_site]

    # Load the URL -> subdir history, falling back to empty on any failure.
    try:
        with open("history.json", "r") as f:
            history = json.load(f)
    except Exception as e:
        logger.error(f"Failed to load history: {e}")
        history = {}

    # Register the URL: subdir name is the MD5 of the URL.
    if url:
        subdir = hashlib.md5(url.encode()).hexdigest()
        os.makedirs(subdir, exist_ok=True)

        if url not in history:
            history[url] = subdir
            try:
                with open("history.json", "w") as f:
                    json.dump(history, f)
            except Exception as e:
                logger.error(f"Failed to save history: {e}")

    if st.sidebar.button('📥 Get All the Content'):
        if url:
            download_html_and_files(url, history[url])
            show_download_links(history[url])
        else:
            st.warning("Please enter or select a URL first")

    if st.sidebar.button('📂 Show Download Links'):
        for subdir in history.values():
            show_download_links(subdir)

    with st.expander("URL History and Downloaded Files"):
        for url, subdir in history.items():
            st.markdown(f"#### {url}")
            show_download_links(subdir)

    # Re-render current files for every known subdir.
    for subdir in history.values():
        show_download_links(subdir)
326