Update app.py
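This commit extends download_html_and_files so it can mirror XML/RSS feeds as well as plain HTML pages: the fetched text is sniffed for XML markers, parsed with the matching BeautifulSoup parser (with an lxml fallback), file links are collected from feed-style tags (link, url, enclosure, media:content) instead of <a> anchors, and the result is saved as feed.xml rather than index.html.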
app.py CHANGED
@@ -68,7 +68,7 @@ def download_file(url: str, local_filename: str) -> Optional[str]:
 
 def download_html_and_files(url: str, subdir: str) -> None:
     """
-    Download HTML content and associated files from a URL.
+    Download HTML/XML content and associated files from a URL.
 
     Args:
         url (str): The URL to download content from
@@ -79,14 +79,28 @@ def download_html_and_files(url: str, subdir: str) -> None:
 
         response = requests.get(url, timeout=30)
         response.raise_for_status()
-        …
+        content = response.text
+
+        # Determine if content is XML or HTML
+        is_xml = url.endswith('.xml') or '<rss' in content[:1000] or '<?xml' in content[:1000]
 
         try:
-            …
+            if is_xml:
+                soup = BeautifulSoup(content, 'xml')  # Use XML parser for XML content
+                st.info("Processing XML content...")
+            else:
+                soup = BeautifulSoup(content, 'html.parser')
+                st.info("Processing HTML content...")
+
         except Exception as e:
-            …
+            # Try alternative parser if first attempt fails
+            try:
+                soup = BeautifulSoup(content, 'lxml')
+                st.info("Using alternative parser (lxml)...")
+            except Exception as inner_e:
+                logger.error(f"Failed to parse content: {e}, {inner_e}")
+                st.error(f"Failed to parse content from {url}")
+                return
 
         base_url = urllib.parse.urlunparse(
             urllib.parse.urlparse(url)._replace(
@@ -94,34 +108,72 @@ def download_html_and_files(url: str, subdir: str) -> None:
             )
         )
 
-        for …
-            …
+        # Handle links differently for XML and HTML
+        if is_xml:
+            # For XML, look for specific tags that might contain links
+            link_tags = (
+                soup.find_all('link') +
+                soup.find_all('url') +
+                soup.find_all('enclosure') +
+                soup.find_all('media:content')
+            )
+
+            for link in link_tags:
+                try:
+                    # Get URL from appropriate attribute
+                    href = (
+                        link.get('href') or
+                        link.get('url') or
+                        link.get('src') or
+                        link.text.strip()
+                    )
+
+                    if href and (href.startswith('http://') or href.startswith('https://')):
+                        file_url = href
+                        local_filename = os.path.join(
+                            subdir,
+                            urllib.parse.urlparse(file_url).path.split('/')[-1]
+                        )
+
+                        if local_filename and not local_filename.endswith('/'):
+                            download_file(file_url, local_filename)
+
+                except Exception as e:
+                    logger.error(f"Failed to process XML link: {e}")
+                    continue
+        else:
+            # Original HTML processing
+            for link in soup.find_all('a'):
+                href = link.get('href')
+                if not href:
                     continue
 
-            …
+                try:
+                    file_url = urllib.parse.urljoin(base_url, href)
+                    local_filename = os.path.join(
+                        subdir,
+                        urllib.parse.urlparse(file_url).path.split('/')[-1]
+                    )
 
-            …
+                    if not local_filename or local_filename.endswith('/'):
+                        continue
+
+                    if local_filename != subdir:
+                        link['href'] = local_filename
+                        download_file(file_url, local_filename)
+
+                except Exception as e:
+                    logger.error(f"Failed to process HTML link {href}: {e}")
+                    continue
 
+        # Save the processed content
         try:
-            …
+            output_filename = "feed.xml" if is_xml else "index.html"
+            with open(os.path.join(subdir, output_filename), "w", encoding='utf-8') as file:
                 file.write(str(soup))
+            st.success(f"Content saved as {output_filename}")
         except Exception as e:
-            logger.error(f"Failed to save …
+            logger.error(f"Failed to save content file: {e}")
             st.error("Failed to save downloaded content")
 
     except requests.exceptions.RequestException as e:
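For reference, the sniff-and-fallback parsing introduced above can be exercised on its own. The sketch below is a minimal standalone reproduction, not part of app.py: the parse_content helper name and the sample feed string are made up for illustration, and BeautifulSoup's 'xml' parser assumes the lxml package is installed.

from bs4 import BeautifulSoup

def parse_content(url: str, content: str) -> BeautifulSoup:
    # Same heuristic as the commit: file extension, or RSS/XML markers near the top.
    is_xml = url.endswith('.xml') or '<rss' in content[:1000] or '<?xml' in content[:1000]
    try:
        return BeautifulSoup(content, 'xml' if is_xml else 'html.parser')
    except Exception:
        # Mirror the commit's fallback branch: retry with lxml before giving up.
        return BeautifulSoup(content, 'lxml')

sample = '<?xml version="1.0"?><rss><channel><item><enclosure url="https://example.com/a.mp3"/></item></channel></rss>'
soup = parse_content("https://example.com/feed.xml", sample)  # hypothetical URL
for enclosure in soup.find_all('enclosure'):
    print(enclosure.get('url'))  # prints https://example.com/a.mp3

Run against a real feed, this takes the same path the updated function follows before it starts fetching linked files with download_file.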