DataSetGo

Sleeping

App Files Files Community

acecalisto3 committed on Oct 31, 2024

Commit

31b4ef8

verified ·

1 Parent(s): 7a8082e

Update app.py

Browse files

Files changed (1) hide show

app.py +251 -154

app.py CHANGED Viewed

@@ -1,17 +1,24 @@
-import os
-import json
 import requests
 import urllib
-import hashlib
 import base64
-import logging
-import streamlit as st
 from bs4 import BeautifulSoup
-from typing import Optional, List
-import feedgenerator
 import time
-from streamlit_option_menu import option_menu
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -19,9 +26,7 @@ logger = logging.getLogger(__name__)
 # Constants
 EXCLUDED_FILES = [
-    'app.py', 'requirements.txt', 'pre-requirements.txt',
-    'packages.txt', 'readme.md', '.gitattributes',
-    "backup.py", "dockerfile"
 ]
 URLS = {
@@ -38,85 +43,116 @@ URLS = {
     "john lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=john%20lennon",
 }
-# Function to toggle dark mode
-def toggle_dark_mode():
-    if 'dark_mode' not in st.session_state:
-        st.session_state.dark_mode = False
-    if st.session_state.dark_mode:
-        st.markdown('''
-        <style>
-        .stApp {
-            background-color: #2b2b2b;
-            color: #ffffff;
-        }
-        </style>
-        ''', unsafe_allow_html=True)
-    else:
-        st.markdown('''
-        <style>
-        .stApp {
-            background-color: #ffffff;
-            color: #000000;
-        }
-        </style>
-        ''', unsafe_allow_html=True)
-# Generate RSS feed
-def generate_rss_feed():
-    feed = feedgenerator.Rss201rev2Feed(
-        title="Infinite Dataset Hub Updates",
-        link="https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub",
-        description="Latest updates from the Infinite Dataset Hub",
-        language="en"
-    )
-    for i, line in enumerate(URLS):
-        dataset_name = line
-        feed.add_item(
-            title=dataset_name,
-            link=URLS[dataset_name],
-            description=f"Link to {dataset_name}",
-            pubdate=time.gmtime(time.time() - 86400 * i)
-        )
-    return feed.writeString('utf-8')
-# Download file
-def download_file(url: str, local_filename: str) -> Optional[str]:
-    try:
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            with open(local_filename, 'wb') as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-        logger.info(f"File downloaded successfully: {local_filename}")
-        return local_filename
-    except requests.exceptions.RequestException as err:
-        logger.error(f"Error occurred while downloading {url}: {err}")
-        return None
-# Download HTML and files
 def download_html_and_files(url: str, subdir: str) -> None:
     try:
         os.makedirs(subdir, exist_ok=True)
         response = requests.get(url, timeout=30)
         response.raise_for_status()
         content = response.text
-        soup = BeautifulSoup(content, 'html.parser')
         base_url = urllib.parse.urlunparse(
             urllib.parse.urlparse(url)._replace(
                 path='', params='', query='', fragment=''
             )
         )
-        progress_bar = st.progress(0)
-        total_links = len(soup.find_all('a'))
-        for i, link in enumerate(soup.find_all('a')):
-            href = link.get('href')
-            if href:
                 try:
                     file_url = urllib.parse.urljoin(base_url, href)
                     local_filename = os.path.join(
@@ -124,16 +160,26 @@ def download_html_and_files(url: str, subdir: str) -> None:
                         urllib.parse.urlparse(file_url).path.split('/')[-1]
                     )
                     if local_filename != subdir:
                         link['href'] = local_filename
                         download_file(file_url, local_filename)
                 except Exception as e:
                     logger.error(f"Failed to process HTML link {href}: {e}")
-            progress_bar.progress((i + 1) / total_links)
-        with open(os.path.join(subdir, "index.html"), "w", encoding='utf-8') as file:
-            file.write(str(soup))
-        st.success("Content saved as index.html")
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download content from {url}: {e}")
         st.error(f"Failed to download content from {url}")
@@ -141,99 +187,150 @@ def download_html_and_files(url: str, subdir: str) -> None:
         logger.error(f"Unexpected error while downloading content: {e}")
         st.error("An unexpected error occurred while downloading content")
-# Show download links
-def show_download_links(subdir: str) -> None:
-    for file in os.listdir(subdir):
-        file_path = os.path.join(subdir, file)
-        if os.path.isfile(file_path):
-            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
-# Get download link
 def get_download_link(file: str) -> str:
-    with open(file, "rb") as f:
-        bytes_content = f.read()
         b64 = base64.b64encode(bytes_content).decode()
         filename = os.path.basename(file)
         return f'<a href="data:file/octet-stream;base64,{b64}" download="{filename}">Download: {filename}</a>'
-# Show file browser
-def show_file_browser():
-    st.write("File Browser")
-    root_dir = "downloads"
-    if not os.path.exists(root_dir):
-        st.warning("No downloads available. Use the Content Downloader to download files.")
-        return
-    for root, dirs, files in os.walk(root_dir):
-        level = root.replace(root_dir, '').count(os.sep)
-        indent = ' ' * 4 * level
-        st.write(f"{indent}{os.path.basename(root)}/")
-        sub_indent = ' ' * 4 * (level + 1)
-        for file in files:
-            st.write(f"{sub_indent}{file}")
-# Main function
-def main():
-    st.set_page_config(page_title="RSS Feed and Content Downloader", layout="wide")
-    # Toggle dark mode
-    toggle_dark_mode()
-    # Sidebar
-    with st.sidebar:
-        st.title("Navigation")
-        selected = option_menu(
-            menu_title=None,
-            options=["RSS Feed", "Content Downloader", "File Manager"],
-            icons=["rss", "cloud-download", "folder"],
-            menu_icon="cast",
-            default_index=0,
-        )
-        # Dark mode toggle
-        st.checkbox("Dark Mode", key="dark_mode", on_change=toggle_dark_mode)
-    # Main content
-    if selected == "RSS Feed":
-        rss_feed_section()
-    elif selected == "Content Downloader":
-        content_downloader_section()
-    elif selected == "File Manager":
-        file_manager_section()
-# RSS Feed Section
-def rss_feed_section():
     st.header("RSS Feed")
     if st.button("Generate RSS Feed"):
-        with st.spinner("Generating RSS Feed..."):
-            rss_feed = generate_rss_feed()
         st.success("RSS Feed generated successfully!")
-        st.code(rss_feed, language="xml")
-        # Option to export RSS feed as XML file
-        st.download_button(
-            label="Download RSS Feed",
-            data=rss_feed,
-            file_name="rss_feed.xml",
-            mime="application/xml"
-        )
-# Content Downloader Section
-def content_downloader_section():
     st.header("Content Downloader")
     selected_url = st.selectbox("Select a URL to download content from:", list(URLS.keys()))
     subdir = st.text_input("Enter subdirectory name to save files:", "downloads")
     if st.button("Download Content"):
-        with st.spinner("Downloading content..."):
-            download_html_and_files(URLS[selected_url], subdir)
         st.success("Content downloaded successfully!")
         show_download_links(subdir)
-# File Manager Section
-def file_manager_section():
-    st.header("File Manager")
-    show_file_browser()
 if __name__ == "__main__":
-    main()

+import streamlit as st
 import requests
+import os
 import urllib
 import base64
 from bs4 import BeautifulSoup
+import hashlib
+import json
+import uuid
+import logging
+from typing import Optional, Dict, List
+from pathlib import Path
import subprocess
import sys
import time

# feedparser is a third-party dependency. Attempt the import first and only
# fall back to installing it when it is genuinely missing; an unconditional
# import ahead of this guard would raise before the fallback could ever run.
try:
    import feedparser
except ImportError:
    # Install with the running interpreter's pip so the package lands in the
    # same environment, then retry the import so `feedparser` is actually bound.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'feedparser'])
    import feedparser
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 # Constants
 EXCLUDED_FILES = [
+    'app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'readme.md', '.gitattributes', "backup.py", "dockerfile"
 ]
 URLS = {
     "john lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=john%20lennon",
 }
def initialize_history() -> None:
    """Create an empty ``history.json`` in the working directory if it is missing.

    The file is opened in exclusive-create mode, so an existing history file
    is never truncated and there is no gap between an existence check and the
    write in which something else could create the file.
    """
    try:
        with open("history.json", "x", encoding="utf-8") as f:
            json.dump({}, f)
    except FileExistsError:
        # History already initialised; leave its contents untouched.
        pass
def download_file(url: str, local_filename: str) -> Optional[str]:
    """Download a file from a URL to a local file.

    Only ``http://`` and ``https://`` URLs are fetched; any other URL is
    skipped without touching the network.

    Args:
        url (str): The URL to download from
        local_filename (str): The local file path to save to

    Returns:
        Optional[str]: The local filename if successful, None if the URL
        scheme is not HTTP(S) or the request failed
    """
    if not url.startswith(('http://', 'https://')):
        return None
    try:
        # Same 30 s timeout as download_html_and_files, so a stalled server
        # cannot block the app indefinitely.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return local_filename
    except requests.exceptions.RequestException as err:
        # RequestException is the base class, so connection failures and
        # timeouts are reported here as well as bad HTTP status codes.
        logger.error(f"Error occurred while downloading {url}: {err}")
        return None
 def download_html_and_files(url: str, subdir: str) -> None:
+    """Download HTML/XML content and associated files from a URL.
+    Args:
+        url (str): The URL to download content from
+        subdir (str): The subdirectory to save files to
+    """
     try:
         os.makedirs(subdir, exist_ok=True)
         response = requests.get(url, timeout=30)
         response.raise_for_status()
         content = response.text
+        # Determine if content is XML or HTML
+        is_xml = url.endswith('.xml') or '<rss' in content[:1000] or '<?xml' in content[:1000]
+        try:
+            if is_xml:
+                soup = BeautifulSoup(content, 'xml')  # Use XML parser for XML content
+                st.info("Processing XML content...")
+            else:
+                soup = BeautifulSoup(content, 'html.parser')
+                st.info("Processing HTML content...")
+        except Exception as e:
+            # Try alternative parser if first attempt fails
+            try:
+                soup = BeautifulSoup(content, 'lxml')
+                st.info("Using alternative parser (lxml)...")
+            except Exception as inner_e:
+                logger.error(f"Failed to parse content: {e}, {inner_e}")
+                st.error(f"Failed to parse content from {url}")
+                return
         base_url = urllib.parse.urlunparse(
             urllib.parse.urlparse(url)._replace(
                 path='', params='', query='', fragment=''
             )
         )
+        # Handle links differently for XML and HTML
+        if is_xml:
+            # For XML, look for specific tags that might contain links
+            link_tags = (
+                soup.find_all('link') +
+                soup.find_all('url') +
+                soup.find_all('enclosure') +
+                soup.find_all('media:content')
+            )
+            for link in link_tags:
+                try:
+                    # Get URL from appropriate attribute
+                    href = (
+                        link.get('href') or
+                        link.get('url') or
+                        link.get('src') or
+                        link.text.strip()
+                    )
+                    if href and (href.startswith('http://') or href.startswith('https://')):
+                        file_url = href
+                        local_filename = os.path.join(
+                            subdir,
+                            urllib.parse.urlparse(file_url).path.split('/')[-1]
+                        )
+                        if local_filename and not local_filename.endswith('/'):
+                            download_file(file_url, local_filename)
+                except Exception as e:
+                    logger.error(f"Failed to process XML link: {e}")
+                    continue
+        else:
+            # Original HTML processing
+            for link in soup.find_all('a'):
+                href = link.get('href')
+                if not href:
+                    continue
                 try:
                     file_url = urllib.parse.urljoin(base_url, href)
                     local_filename = os.path.join(
                         urllib.parse.urlparse(file_url).path.split('/')[-1]
                     )
+                    if not local_filename or local_filename.endswith('/'):
+                        continue
                     if local_filename != subdir:
                         link['href'] = local_filename
                         download_file(file_url, local_filename)
                 except Exception as e:
                     logger.error(f"Failed to process HTML link {href}: {e}")
+                    continue
+        # Save the processed content
+        try:
+            output_filename = "feed.xml" if is_xml else "index.html"
+            with open(os.path.join(subdir, output_filename), "w", encoding='utf-8') as file:
+                file.write(str(soup))
+            st.success(f"Content saved as {output_filename}")
+        except Exception as e:
+            logger.error(f"Failed to save content file: {e}")
+            st.error("Failed to save downloaded content")
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download content from {url}: {e}")
         st.error(f"Failed to download content from {url}")
         logger.error(f"Unexpected error while downloading content: {e}")
         st.error("An unexpected error occurred while downloading content")
def list_files(directory_path: str = '.') -> List[str]:
    """List the regular files in a directory, skipping EXCLUDED_FILES.

    Names are compared case-insensitively: EXCLUDED_FILES contains lower-case
    entries such as ``readme.md`` and ``dockerfile``, which an exact comparison
    does not match against files named ``README.md`` or ``Dockerfile``.

    Args:
        directory_path (str): Directory to scan (not recursive).

    Returns:
        List[str]: File names (not paths) in ``os.listdir`` order.
    """
    excluded = {name.lower() for name in EXCLUDED_FILES}
    return [
        entry
        for entry in os.listdir(directory_path)
        if entry.lower() not in excluded
        and os.path.isfile(os.path.join(directory_path, entry))
    ]
def file_editor(file_path: str) -> None:
    """Show a file's text in a Streamlit text area with a button that writes edits back.

    Read and write failures are logged and reported through ``st.error``;
    a failed read ends the function before any widget is drawn.

    Args:
        file_path (str): Path of the UTF-8 text file to edit.
    """
    st.write(f"Editing File: {os.path.basename(file_path)}")

    # Load the current contents; without them there is nothing to edit.
    try:
        with open(file_path, "r", encoding='utf-8') as source:
            current_text = source.read()
    except Exception as e:
        logger.error(f"Failed to read file {file_path}: {e}")
        st.error("Failed to read file")
        return

    edited_content = st.text_area("Edit the file content:", value=current_text, height=250)

    # Nothing more to do unless the save button was pressed.
    if not st.button("💾 Save"):
        return

    try:
        with open(file_path, "w", encoding='utf-8') as target:
            target.write(edited_content)
        st.success(f"File '{os.path.basename(file_path)}' saved!")
    except Exception as e:
        logger.error(f"Failed to save file {file_path}: {e}")
        st.error("Failed to save file")
def show_file_operations(file_path: str, sequence_number: int) -> None:
    """Draw a row of Edit / Save / Delete controls for one file.

    Five columns are created: the file name, one column left empty, and one
    each for the Edit, Save and Delete buttons. Widget keys combine the MD5
    hex digest of ``file_path`` with ``sequence_number``.

    Args:
        file_path (str): Path of the file the controls act on.
        sequence_number (int): Value mixed into the widget keys.
    """
    digest = hashlib.md5(file_path.encode()).hexdigest()
    pending_text = ""
    name_col, _spare_col, edit_col, save_col, delete_col = st.columns(5)

    with name_col:
        st.write(os.path.basename(file_path))

    with edit_col:
        if st.button("✏️ Edit", key=f"edit_{digest}{sequence_number}"):
            try:
                with open(file_path, "r", encoding='utf-8') as source:
                    pending_text = source.read()
                pending_text = st.text_area(
                    "Edit the file content:",
                    value=pending_text,
                    height=250,
                    key=f"text_area{digest}_{sequence_number}",
                )
            except Exception as e:
                logger.error(f"Failed to read file {file_path}: {e}")
                st.error("Failed to read file")

    with save_col:
        # NOTE(review): pending_text is only filled by the Edit branch above
        # during this same call; presumably a later Streamlit rerun starts
        # with it empty again, so Save would write nothing - confirm.
        save_clicked = st.button("💾 Save", key=f"save_{digest}_{sequence_number}")
        if save_clicked and pending_text:
            try:
                with open(file_path, "w", encoding='utf-8') as target:
                    target.write(pending_text)
                st.success("File saved!")
            except Exception as e:
                logger.error(f"Failed to save file {file_path}: {e}")
                st.error("Failed to save file")

    with delete_col:
        if st.button("🗑️ Delete", key=f"delete_{digest}_{sequence_number}"):
            try:
                os.remove(file_path)
                st.success("File deleted!")
            except Exception as e:
                logger.error(f"Failed to delete file {file_path}: {e}")
                st.error("Failed to delete file")
 def get_download_link(file: str) -> str:
     """Build an HTML anchor that embeds a file as a base64 ``data:`` URI.

     The file name is HTML-escaped before it is placed in the markup. The
     caller renders the result with ``unsafe_allow_html=True`` and file names
     are taken from the last path segment of downloaded URLs, so an unescaped
     name could inject markup into the page.

     Args:
         file (str): Path of the file to embed.

     Returns:
         str: The anchor element, or a plain failure message if the file
         could not be read.
     """
     import html  # local import keeps this block self-contained

     safe_name = html.escape(os.path.basename(file), quote=True)
     try:
         with open(file, "rb") as f:
             b64 = base64.b64encode(f.read()).decode()
     except Exception as e:
         logger.error(f"Failed to create download link for {file}: {e}")
         return f"Failed to create download link for {safe_name}"
     return f'<a href="data:file/octet-stream;base64,{b64}" download="{safe_name}">Download: {safe_name}</a>'
def show_download_links(subdir: str) -> None:
    """Show a download link and file-operation controls for each file in a directory.

    A per-path call counter is kept on the function object itself
    (``show_download_links.file_sequence_numbers``) and passed to
    ``show_file_operations`` as the sequence number.

    Args:
        subdir (str): Directory whose files (as returned by ``list_files``)
            are displayed.
    """
    # The counters live on the function attribute; no module-level global is involved.
    if not hasattr(show_download_links, 'file_sequence_numbers'):
        show_download_links.file_sequence_numbers = {}
    sequence_numbers = show_download_links.file_sequence_numbers

    for file in list_files(subdir):
        file_path = os.path.join(subdir, file)
        sequence_numbers[file_path] = sequence_numbers.get(file_path, 0) + 1
        if os.path.isfile(file_path):
            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
            show_file_operations(file_path, sequence_numbers[file_path])
        else:
            # The file vanished between listing and rendering.
            st.write(f"File not found: {file}")
# Fetch and parse the RSS feed
def generate_rss_feed():
    """Parse the Infinite Dataset Hub RSS URL with feedparser and return the parsed result."""
    feed_url = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub/rss"
    return feedparser.parse(feed_url)
def main() -> None:
    """Streamlit entry point: render the RSS feed and content downloader sections."""
    st.title("RSS Feed and Content Downloader")

    # Make sure history.json exists before any section is drawn
    initialize_history()

    # --- RSS feed ---
    st.header("RSS Feed")
    feed_requested = st.button("Generate RSS Feed")
    if feed_requested:
        parsed_feed = generate_rss_feed()
        st.success("RSS Feed generated successfully!")
        st.write(parsed_feed)

    # --- Content downloader ---
    st.header("Content Downloader")
    url_names = list(URLS.keys())
    selected_url = st.selectbox("Select a URL to download content from:", url_names)
    subdir = st.text_input("Enter subdirectory name to save files:", "downloads")
    if st.button("Download Content"):
        target_url = URLS[selected_url]
        download_html_and_files(target_url, subdir)
        st.success("Content downloaded successfully!")
        show_download_links(subdir)
 if __name__ == "__main__":
+    main()