Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
| 5 |
from llama_index.core.node_parser import MarkdownNodeParser
|
| 6 |
from llama_index.core.schema import Document, MetadataMode
|
| 7 |
import textstat
|
| 8 |
-
from
|
| 9 |
|
| 10 |
# --- Core Logic Classes ---
|
| 11 |
|
|
@@ -20,7 +20,7 @@ class WebpageContentProcessor:
|
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
Fetches HTML content, removes common boilerplate tags from the entire page,
|
| 23 |
-
and then converts the remaining body content to Markdown using
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
@@ -42,11 +42,9 @@ class WebpageContentProcessor:
|
|
| 42 |
if not content_container:
|
| 43 |
return "Error: Could not find the <body> of the webpage."
|
| 44 |
|
| 45 |
-
# --- MODIFIED:
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
# 2. Call the .convert() method with the HTML content.
|
| 49 |
-
markdown_output = markdown_converter_instance.convert(str(content_container))
|
| 50 |
# -----------------------------------------------
|
| 51 |
|
| 52 |
# Post-processing to clean up the resulting Markdown
|
|
@@ -59,6 +57,7 @@ class WebpageContentProcessor:
|
|
| 59 |
except requests.exceptions.RequestException as e:
|
| 60 |
return f"Error fetching the URL: {e}. Please check the URL and your connection."
|
| 61 |
except Exception as e:
|
|
|
|
| 62 |
return f"An unexpected error occurred during content processing: {e}"
|
| 63 |
|
| 64 |
def parse_markdown_into_chunks(self, markdown_content: str) -> list:
|
|
@@ -323,4 +322,4 @@ with tab2:
|
|
| 323 |
st.rerun()
|
| 324 |
|
| 325 |
st.subheader("Final Compiled Document")
|
| 326 |
-
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|
|
|
|
| 5 |
from llama_index.core.node_parser import MarkdownNodeParser
|
| 6 |
from llama_index.core.schema import Document, MetadataMode
|
| 7 |
import textstat
|
| 8 |
+
from markdownify import markdownify as md # <-- MODIFIED: Switched to markdownify
|
| 9 |
|
| 10 |
# --- Core Logic Classes ---
|
| 11 |
|
|
|
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
Fetches HTML content, removes common boilerplate tags from the entire page,
|
| 23 |
+
and then converts the remaining body content to Markdown using markdownify.
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
|
|
| 42 |
if not content_container:
|
| 43 |
return "Error: Could not find the <body> of the webpage."
|
| 44 |
|
| 45 |
+
# --- MODIFIED: Switched to markdownify for conversion ---
|
| 46 |
+
# markdownify exposes a plain function, so no converter instance has to be created first.
|
| 47 |
+
markdown_output = md(str(content_container))
|
|
|
|
|
|
|
| 48 |
# -----------------------------------------------
|
| 49 |
|
| 50 |
# Post-processing to clean up the resulting Markdown
|
|
|
|
| 57 |
except requests.exceptions.RequestException as e:
|
| 58 |
return f"Error fetching the URL: {e}. Please check the URL and your connection."
|
| 59 |
except Exception as e:
|
| 60 |
+
# Catch-all for any non-request failure; the error is returned to the caller as a message (nothing is logged here).
|
| 61 |
return f"An unexpected error occurred during content processing: {e}"
|
| 62 |
|
| 63 |
def parse_markdown_into_chunks(self, markdown_content: str) -> list:
|
|
|
|
| 322 |
st.rerun()
|
| 323 |
|
| 324 |
st.subheader("Final Compiled Document")
|
| 325 |
+
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|