Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Files Community

Em4e committed on Jun 9, 2025

Commit

259dab0

verified ·

1 Parent(s): 98ddca0

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -10

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import re
 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
-from markitdown import Markitdown # <-- MODIFIED: Import Markitdown
 # --- Core Logic Classes ---
@@ -15,14 +15,15 @@ class WebpageContentProcessor:
     This class is responsible for the entire content processing pipeline.
     """
     def __init__(self):
-        # --- MODIFIED: Instantiate Markitdown converter ---
-        self.markdown_converter = Markitdown()
-        # -------------------------------------------------
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
-        and then converts the remaining body content to Markdown using Markitdown.
         """
         try:
             headers = {
@@ -44,10 +45,11 @@ class WebpageContentProcessor:
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
-            # --- MODIFIED: Use Markitdown for conversion ---
-            # The .convert() method returns an object; the HTML is in the .text attribute
-            conversion_result = self.markdown_converter.convert(str(content_container))
-            markdown_output = conversion_result.text
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
@@ -324,4 +326,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Compiled Document")
-    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")

 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
+from markitdown import MarkItDown # <-- MODIFIED: Corrected class name casing
 # --- Core Logic Classes ---
     This class is responsible for the entire content processing pipeline.
     """
     def __init__(self):
+        # --- MODIFIED: Removed the converter instantiation from init ---
+        # The MarkItDown library is instantiated per-conversion.
+        pass
+        # -----------------------------------------------------------
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
+        and then converts the remaining body content to Markdown using MarkItDown.
         """
         try:
             headers = {
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
+            # --- MODIFIED: Corrected MarkItDown usage ---
+            # Instantiate the converter directly with the HTML content.
+            # The result object's 'text' attribute holds the markdown.
+            markdown_converter_instance = MarkItDown(str(content_container))
+            markdown_output = markdown_converter_instance.text
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
             st.rerun()
     st.subheader("Final Compiled Document")
+    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")