acecalisto3 committed on
Commit
d1cc889
·
verified ·
1 Parent(s): 9ded418

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -27
app.py CHANGED
@@ -68,7 +68,7 @@ def download_file(url: str, local_filename: str) -> Optional[str]:
68
 
69
  def download_html_and_files(url: str, subdir: str) -> None:
70
  """
71
- Download HTML content and associated files from a URL.
72
 
73
  Args:
74
  url (str): The URL to download content from
@@ -79,14 +79,28 @@ def download_html_and_files(url: str, subdir: str) -> None:
79
 
80
  response = requests.get(url, timeout=30)
81
  response.raise_for_status()
82
- html_content = response.text
 
 
 
83
 
84
  try:
85
- soup = BeautifulSoup(content)
 
 
 
 
 
 
86
  except Exception as e:
87
- logger.error(f"Failed to parse HTML content: {e}")
88
- st.error(f"Failed to parse HTML content from {url}")
89
- return
 
 
 
 
 
90
 
91
  base_url = urllib.parse.urlunparse(
92
  urllib.parse.urlparse(url)._replace(
@@ -94,34 +108,72 @@ def download_html_and_files(url: str, subdir: str) -> None:
94
  )
95
  )
96
 
97
- for link in soup.find_all('a'):
98
- href = link.get('href')
99
- if not href:
100
- continue
101
-
102
- try:
103
- file_url = urllib.parse.urljoin(base_url, href)
104
- local_filename = os.path.join(
105
- subdir,
106
- urllib.parse.urlparse(file_url).path.split('/')[-1]
107
- )
108
-
109
- if not local_filename or local_filename.endswith('/'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  continue
111
 
112
- if local_filename != subdir:
113
- link['href'] = local_filename
114
- download_file(file_url, local_filename)
 
 
 
115
 
116
- except Exception as e:
117
- logger.error(f"Failed to process link {href}: {e}")
118
- continue
 
 
 
 
 
 
 
119
 
 
120
  try:
121
- with open(os.path.join(subdir, "index.html"), "w", encoding='utf-8') as file:
 
122
  file.write(str(soup))
 
123
  except Exception as e:
124
- logger.error(f"Failed to save HTML file: {e}")
125
  st.error("Failed to save downloaded content")
126
 
127
  except requests.exceptions.RequestException as e:
 
68
 
69
  def download_html_and_files(url: str, subdir: str) -> None:
70
  """
71
+ Download HTML/XML content and associated files from a URL.
72
 
73
  Args:
74
  url (str): The URL to download content from
 
79
 
80
  response = requests.get(url, timeout=30)
81
  response.raise_for_status()
82
+ content = response.text
83
+
84
+ # Determine if content is XML or HTML
85
+ is_xml = url.endswith('.xml') or '<rss' in content[:1000] or '<?xml' in content[:1000]
86
 
87
  try:
88
+ if is_xml:
89
+ soup = BeautifulSoup(content, 'xml') # Use XML parser for XML content
90
+ st.info("Processing XML content...")
91
+ else:
92
+ soup = BeautifulSoup(content, 'html.parser')
93
+ st.info("Processing HTML content...")
94
+
95
  except Exception as e:
96
+ # Try alternative parser if first attempt fails
97
+ try:
98
+ soup = BeautifulSoup(content, 'lxml')
99
+ st.info("Using alternative parser (lxml)...")
100
+ except Exception as inner_e:
101
+ logger.error(f"Failed to parse content: {e}, {inner_e}")
102
+ st.error(f"Failed to parse content from {url}")
103
+ return
104
 
105
  base_url = urllib.parse.urlunparse(
106
  urllib.parse.urlparse(url)._replace(
 
108
  )
109
  )
110
 
111
+ # Handle links differently for XML and HTML
112
+ if is_xml:
113
+ # For XML, look for specific tags that might contain links
114
+ link_tags = (
115
+ soup.find_all('link') +
116
+ soup.find_all('url') +
117
+ soup.find_all('enclosure') +
118
+ soup.find_all('media:content')
119
+ )
120
+
121
+ for link in link_tags:
122
+ try:
123
+ # Get URL from appropriate attribute
124
+ href = (
125
+ link.get('href') or
126
+ link.get('url') or
127
+ link.get('src') or
128
+ link.text.strip()
129
+ )
130
+
131
+ if href and (href.startswith('http://') or href.startswith('https://')):
132
+ file_url = href
133
+ local_filename = os.path.join(
134
+ subdir,
135
+ urllib.parse.urlparse(file_url).path.split('/')[-1]
136
+ )
137
+
138
+ if local_filename and not local_filename.endswith('/'):
139
+ download_file(file_url, local_filename)
140
+
141
+ except Exception as e:
142
+ logger.error(f"Failed to process XML link: {e}")
143
+ continue
144
+ else:
145
+ # Original HTML processing
146
+ for link in soup.find_all('a'):
147
+ href = link.get('href')
148
+ if not href:
149
  continue
150
 
151
+ try:
152
+ file_url = urllib.parse.urljoin(base_url, href)
153
+ local_filename = os.path.join(
154
+ subdir,
155
+ urllib.parse.urlparse(file_url).path.split('/')[-1]
156
+ )
157
 
158
+ if not local_filename or local_filename.endswith('/'):
159
+ continue
160
+
161
+ if local_filename != subdir:
162
+ link['href'] = local_filename
163
+ download_file(file_url, local_filename)
164
+
165
+ except Exception as e:
166
+ logger.error(f"Failed to process HTML link {href}: {e}")
167
+ continue
168
 
169
+ # Save the processed content
170
  try:
171
+ output_filename = "feed.xml" if is_xml else "index.html"
172
+ with open(os.path.join(subdir, output_filename), "w", encoding='utf-8') as file:
173
  file.write(str(soup))
174
+ st.success(f"Content saved as {output_filename}")
175
  except Exception as e:
176
+ logger.error(f"Failed to save content file: {e}")
177
  st.error("Failed to save downloaded content")
178
 
179
  except requests.exceptions.RequestException as e: