Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Community

Em4e committed on Jun 9, 2025

Commit

54f6925

verified ·

1 Parent(s): e69dbdc

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -8

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import re
 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
-from markitdown import MarkItDown # <-- MODIFIED: Corrected class name casing
 # --- Core Logic Classes ---
@@ -20,7 +20,7 @@ class WebpageContentProcessor:
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
-        and then converts the remaining body content to Markdown using MarkItDown.
         """
         try:
             headers = {
@@ -42,11 +42,9 @@ class WebpageContentProcessor:
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
-            # --- MODIFIED: Corrected MarkItDown usage for the installed library version ---
-            # 1. Instantiate the converter object.
-            markdown_converter_instance = MarkItDown()
-            # 2. Call the .convert() method with the HTML content.
-            markdown_output = markdown_converter_instance.convert(str(content_container))
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
@@ -59,6 +57,7 @@ class WebpageContentProcessor:
         except requests.exceptions.RequestException as e:
             return f"Error fetching the URL: {e}. Please check the URL and your connection."
         except Exception as e:
             return f"An unexpected error occurred during content processing: {e}"
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
@@ -323,4 +322,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Compiled Document")
-    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")

 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
+from markdownify import markdownify as md # <-- MODIFIED: Switched to markdownify
 # --- Core Logic Classes ---
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
+        and then converts the remaining body content to Markdown using markdownify.
         """
         try:
             headers = {
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
+            # --- MODIFIED: Switched to markdownify for conversion ---
+            # markdownify is a simple function call.
+            markdown_output = md(str(content_container))
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
         except requests.exceptions.RequestException as e:
             return f"Error fetching the URL: {e}. Please check the URL and your connection."
         except Exception as e:
+            # Catch-all: report any other failure to the caller as an error string
             return f"An unexpected error occurred during content processing: {e}"
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
             st.rerun()
     st.subheader("Final Compiled Document")
+    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")